From 7d3ee74a71f3314aca15cfab9df711cf72808bb3 Mon Sep 17 00:00:00 2001 From: Sean Parkinson Date: Thu, 2 Jan 2025 19:56:04 +1000 Subject: [PATCH] Aarch64 ASM: Use CPU features for more AES GCM streaming - fix GHASH_ONE_BLOCK to use CPU feature information. AES-GCM uses EOR3 (SHA-3 instruction) - split assembly code. Kyber uses SQRDMLSH - split assembly code. Changed define from WOLFSSL_AARCH64_NO_SQRMLSH to WOLFSSL_AARCH64_NO_SQRDMLSH to match instruction. Improved array data format for inline assembly code. --- configure.ac | 2 +- wolfcrypt/src/aes.c | 107 +- wolfcrypt/src/cpuid.c | 2 +- wolfcrypt/src/port/arm/armv8-aes.c | 13275 +++++++++++++++--- wolfcrypt/src/port/arm/armv8-kyber-asm.S | 3419 ++++- wolfcrypt/src/port/arm/armv8-kyber-asm_c.c | 9549 +++++-------- wolfcrypt/src/port/arm/armv8-sha3-asm_c.c | 72 +- wolfcrypt/src/port/arm/armv8-sha512-asm_c.c | 243 +- wolfcrypt/src/wc_kyber_poly.c | 151 +- wolfssl/wolfcrypt/aes.h | 4 +- wolfssl/wolfcrypt/wc_kyber.h | 3 + 11 files changed, 17613 insertions(+), 9214 deletions(-) diff --git a/configure.ac b/configure.ac index ce36e38ebd..cbb9a339f3 100644 --- a/configure.ac +++ b/configure.ac @@ -3125,7 +3125,7 @@ then AM_CPPFLAGS="$AM_CPPFLAGS+sm4" fi else - AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto -DWOLFSSL_AARCH64_NO_SQRMLSH" + AM_CPPFLAGS="$AM_CPPFLAGS -mcpu=generic+crypto -DWOLFSSL_AARCH64_NO_SQRDMLSH" fi ;; esac diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index cf500649e5..35b8f12641 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -805,6 +805,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits aes->use_aes_hw_crypto = IS_AARCH64_AES(cpuid_flags); #ifdef HAVE_AESGCM aes->use_pmull_hw_crypto = IS_AARCH64_PMULL(cpuid_flags); + aes->use_sha3_hw_crypto = IS_AARCH64_SHA3(cpuid_flags); #endif } @@ -6448,6 +6449,22 @@ static WC_INLINE void IncCtr(byte* ctr, word32 ctrSz) #define AES_LASTGBLOCK(aes) ((aes)->streamData + 3 * WC_AES_BLOCK_SIZE) /* Access last encrypted block. */ #define AES_LASTBLOCK(aes) ((aes)->streamData + 4 * WC_AES_BLOCK_SIZE) + + #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ + !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + #define GHASH_ONE_BLOCK(aes, block) \ + do { \ + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { \ + GHASH_ONE_BLOCK_AARCH64(aes, block); \ + } \ + else { \ + GHASH_ONE_BLOCK_SW(aes, block); \ + } \ + } \ + while (0) + #else + #define GHASH_ONE_BLOCK GHASH_ONE_BLOCK_SW + #endif #endif #if defined(HAVE_COLDFIRE_SEC) @@ -6866,9 +6883,9 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ do { \ - xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ + xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ GMULT(AES_TAG(aes), aes->gcm.H); \ } \ while (0) @@ -7099,9 +7116,9 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. 
*/ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ do { \ - xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ + xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ GMULT(AES_TAG(aes), aes->gcm.M0); \ } \ while (0) @@ -7392,8 +7409,6 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, */ #define GHASH_INIT_EXTRA(aes) WC_DO_NOTHING -#if !defined(__aarch64__) || !defined(WOLFSSL_ARMASM) || \ - defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) /* GHASH one block of data.. * * XOR block into tag and GMULT with H using pre-computed table. @@ -7401,13 +7416,12 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ do { \ - xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ + xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ GMULT(AES_TAG(aes), (aes)->gcm.M0); \ } \ while (0) -#endif #endif /* WOLFSSL_AESGCM_STREAM */ #elif defined(WORD64_AVAILABLE) && !defined(GCM_WORD32) @@ -7574,17 +7588,17 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ - do { \ - word64* x = (word64*)AES_TAG(aes); \ - word64* h = (word64*)aes->gcm.H; \ - word64 block64[2]; \ - XMEMCPY(block64, block, WC_AES_BLOCK_SIZE); \ - ByteReverseWords64(block64, block64, WC_AES_BLOCK_SIZE); \ - x[0] ^= block64[0]; \ - x[1] ^= block64[1]; \ - GMULT(x, h); \ - } \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ + do { \ + word64* x = (word64*)AES_TAG(aes); \ + word64* h = (word64*)aes->gcm.H; \ + word64 block64[2]; \ + XMEMCPY(block64, block, WC_AES_BLOCK_SIZE); \ + ByteReverseWords64(block64, block64, WC_AES_BLOCK_SIZE); \ + x[0] ^= block64[0]; \ + x[1] ^= block64[1]; \ + GMULT(x, h); \ + } \ while (0) #ifdef OPENSSL_EXTRA @@ -7609,7 +7623,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, x[0] ^= len[0]; \ x[1] ^= len[1]; \ GMULT(x, h); \ - ByteReverseWords64(x, x, WC_AES_BLOCK_SIZE); \ + ByteReverseWords64(x, x, WC_AES_BLOCK_SIZE); \ } \ while (0) #else @@ -7632,7 +7646,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, x[0] ^= len[0]; \ x[1] ^= len[1]; \ GMULT(x, h); \ - ByteReverseWords64(x, x, WC_AES_BLOCK_SIZE); \ + ByteReverseWords64(x, x, WC_AES_BLOCK_SIZE); \ } \ while (0) #endif @@ -7652,7 +7666,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ do { \ word64* x = (word64*)AES_TAG(aes); \ word64* h = (word64*)aes->gcm.H; \ @@ -7884,19 +7898,19 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. 
*/ -#define GHASH_ONE_BLOCK(aes, block) \ - do { \ - word32* x = (word32*)AES_TAG(aes); \ - word32* h = (word32*)aes->gcm.H; \ - word32 bigEnd[4]; \ - XMEMCPY(bigEnd, block, WC_AES_BLOCK_SIZE); \ - ByteReverseWords(bigEnd, bigEnd, WC_AES_BLOCK_SIZE); \ - x[0] ^= bigEnd[0]; \ - x[1] ^= bigEnd[1]; \ - x[2] ^= bigEnd[2]; \ - x[3] ^= bigEnd[3]; \ - GMULT(x, h); \ - } \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ + do { \ + word32* x = (word32*)AES_TAG(aes); \ + word32* h = (word32*)aes->gcm.H; \ + word32 bigEnd[4]; \ + XMEMCPY(bigEnd, block, WC_AES_BLOCK_SIZE); \ + ByteReverseWords(bigEnd, bigEnd, WC_AES_BLOCK_SIZE); \ + x[0] ^= bigEnd[0]; \ + x[1] ^= bigEnd[1]; \ + x[2] ^= bigEnd[2]; \ + x[3] ^= bigEnd[3]; \ + GMULT(x, h); \ + } \ while (0) /* GHASH in AAD and cipher text lengths in bits. @@ -7919,7 +7933,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, x[2] ^= len[2]; \ x[3] ^= len[3]; \ GMULT(x, h); \ - ByteReverseWords(x, x, WC_AES_BLOCK_SIZE); \ + ByteReverseWords(x, x, WC_AES_BLOCK_SIZE); \ } \ while (0) #else @@ -7936,12 +7950,12 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_SW(aes, block) \ do { \ word32* x = (word32*)AES_TAG(aes); \ word32* h = (word32*)aes->gcm.H; \ word32 block32[4]; \ - XMEMCPY(block32, block, WC_AES_BLOCK_SIZE); \ + XMEMCPY(block32, block, WC_AES_BLOCK_SIZE); \ x[0] ^= block32[0]; \ x[1] ^= block32[1]; \ x[2] ^= block32[2]; \ @@ -7985,7 +7999,7 @@ void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, */ #define GHASH_LEN_BLOCK(aes) \ do { \ - byte scratch[WC_AES_BLOCK_SIZE]; \ + byte scratch[WC_AES_BLOCK_SIZE]; \ FlattenSzInBits(&scratch[0], (aes)->aSz); \ FlattenSzInBits(&scratch[8], (aes)->cSz); \ GHASH_ONE_BLOCK(aes, scratch); \ @@ -8139,7 +8153,8 @@ static void GHASH_FINAL(Aes* aes, byte* s, word32 sSz) } if (over > 0) { /* Zeroize the unused part of the block. */ - XMEMSET(AES_LASTGBLOCK(aes) + over, 0, (size_t)WC_AES_BLOCK_SIZE - over); + XMEMSET(AES_LASTGBLOCK(aes) + over, 0, + (size_t)WC_AES_BLOCK_SIZE - over); /* Hash the last block of cipher text. */ GHASH_ONE_BLOCK(aes, AES_LASTGBLOCK(aes)); } @@ -10189,7 +10204,7 @@ int wc_AesGcmInit(Aes* aes, const byte* key, word32 len, const byte* iv, else #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { AES_GCM_init_AARCH64(aes, iv, ivSz); /* Reset state fields. 
*/ @@ -10328,7 +10343,7 @@ int wc_AesGcmEncryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, else #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { AES_GCM_crypt_update_AARCH64(aes, out, in, sz); GHASH_UPDATE_AARCH64(aes, authIn, authInSz, out, sz); } @@ -10388,7 +10403,7 @@ int wc_AesGcmEncryptFinal(Aes* aes, byte* authTag, word32 authTagSz) else #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { AES_GCM_final_AARCH64(aes, authTag, authTagSz); } else @@ -10477,7 +10492,7 @@ int wc_AesGcmDecryptUpdate(Aes* aes, byte* out, const byte* in, word32 sz, else #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { GHASH_UPDATE_AARCH64(aes, authIn, authInSz, in, sz); AES_GCM_crypt_update_AARCH64(aes, out, in, sz); } @@ -10535,7 +10550,7 @@ int wc_AesGcmDecryptFinal(Aes* aes, const byte* authTag, word32 authTagSz) else #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) - if (aes->use_aes_hw_crypto) { + if (aes->use_aes_hw_crypto && aes->use_pmull_hw_crypto) { ALIGN32 byte calcTag[WC_AES_BLOCK_SIZE]; AES_GCM_final_AARCH64(aes, calcTag, authTagSz); /* Check calculated tag matches the one passed in. */ diff --git a/wolfcrypt/src/cpuid.c b/wolfcrypt/src/cpuid.c index a9f15338cf..a91e343fb8 100644 --- a/wolfcrypt/src/cpuid.c +++ b/wolfcrypt/src/cpuid.c @@ -289,7 +289,7 @@ #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 cpuid_flags |= CPUID_SHA512; #endif - #ifndef WOLFSSL_AARCH64_NO_SQRMLSH + #ifndef WOLFSSL_AARCH64_NO_SQRDMLSH cpuid_flags |= CPUID_RDM; #endif #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index 10d5dac7fd..20063bbf67 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -1540,8 +1540,8 @@ void GMULT_AARCH64(byte* X, byte* Y) ); } -static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, - word32 cSz, byte* s, word32 sSz) +static void GHASH_AARCH64_EOR(Gcm* gcm, const byte* a, word32 aSz, + const byte* c, word32 cSz, byte* s, word32 sSz) { byte scratch[WC_AES_BLOCK_SIZE]; @@ -1616,12 +1616,8 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "PMULL v9.1q, v12.1d, v4.1d \n" "PMULL2 v12.1q, v12.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" -#else "EOR v12.16b, v12.16b, v9.16b \n" "EOR v2.16b, v2.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v8.1q, v11.1d, v5.1d \n" "PMULL2 v9.1q, v11.2d, v5.2d \n" @@ -1630,12 +1626,8 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "EXT v11.16b, v11.16b, v11.16b, #8 \n" "PMULL v9.1q, v11.1d, v5.1d \n" "PMULL2 v11.1q, v11.2d, v5.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" -#else "EOR v11.16b, v11.16b, v9.16b \n" "EOR v2.16b, v2.16b, v11.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v8.1q, v10.1d, v6.1d \n" "PMULL2 v9.1q, v10.2d, v6.2d \n" @@ -1644,23 +1636,13 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 
aSz, const byte* c, "EXT v10.16b, v10.16b, v10.16b, #8 \n" "PMULL v9.1q, v10.1d, v6.1d \n" "PMULL2 v10.1q, v10.2d, v6.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" -#else "EOR v10.16b, v10.16b, v9.16b \n" "EOR v2.16b, v2.16b, v10.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v9.16b, v0.16b, v1.16b, #8 \n" "PMULL2 v8.1q, v1.2d, v7.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" -#else "EOR v9.16b, v9.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v9.16b, v9.16b, v8.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v8.1q, v9.2d, v7.2d \n" "MOV v0.D[1], v9.D[0] \n" "EOR v0.16b, v0.16b, v8.16b \n" @@ -1787,12 +1769,8 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "PMULL v9.1q, v12.1d, v4.1d \n" "PMULL2 v12.1q, v12.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" -#else "EOR v12.16b, v12.16b, v9.16b \n" "EOR v2.16b, v2.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v8.1q, v11.1d, v5.1d \n" "PMULL2 v9.1q, v11.2d, v5.2d \n" @@ -1801,12 +1779,8 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "EXT v11.16b, v11.16b, v11.16b, #8 \n" "PMULL v9.1q, v11.1d, v5.1d \n" "PMULL2 v11.1q, v11.2d, v5.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" -#else "EOR v11.16b, v11.16b, v9.16b \n" "EOR v2.16b, v2.16b, v11.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v8.1q, v10.1d, v6.1d \n" "PMULL2 v9.1q, v10.2d, v6.2d \n" @@ -1815,23 +1789,13 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, "EXT v10.16b, v10.16b, v10.16b, #8 \n" "PMULL v9.1q, v10.1d, v6.1d \n" "PMULL2 v10.1q, v10.2d, v6.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" -#else "EOR v10.16b, v10.16b, v9.16b \n" "EOR v2.16b, v2.16b, v10.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v9.16b, v0.16b, v1.16b, #8 \n" "PMULL2 v8.1q, v1.2d, v7.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" -#else "EOR v9.16b, v9.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v9.16b, v9.16b, v8.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v8.1q, v9.2d, v7.2d \n" "MOV v0.D[1], v9.D[0] \n" "EOR v0.16b, v0.16b, v8.16b \n" @@ -1909,301 +1873,10499 @@ static void GHASH_AARCH64(Gcm* gcm, const byte* a, word32 aSz, const byte* c, XMEMCPY(s, scratch, sSz); } +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 -#ifdef WOLFSSL_AESGCM_STREAM - /* Access initialization counter data. */ - #define AES_INITCTR(aes) ((aes)->streamData + 0 * WC_AES_BLOCK_SIZE) - /* Access counter data. */ - #define AES_COUNTER(aes) ((aes)->streamData + 1 * WC_AES_BLOCK_SIZE) - /* Access tag data. */ - #define AES_TAG(aes) ((aes)->streamData + 2 * WC_AES_BLOCK_SIZE) - /* Access last GHASH block. */ - #define AES_LASTGBLOCK(aes) ((aes)->streamData + 3 * WC_AES_BLOCK_SIZE) - /* Access last encrypted block. */ - #define AES_LASTBLOCK(aes) ((aes)->streamData + 4 * WC_AES_BLOCK_SIZE) - -/* GHASH one block of data. - * - * XOR block into tag and GMULT with H. - * - * @param [in, out] aes AES GCM object. - * @param [in] block Block of AAD or cipher text. 
- */ -#define GHASH_ONE_BLOCK_AARCH64(aes, block) \ - do { \ - xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ - GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ - } \ - while (0) - -/* Hash in the lengths of the AAD and cipher text in bits. - * - * Default implementation. - * - * @param [in, out] aes AES GCM object. - */ -#define GHASH_LEN_BLOCK_AARCH64(aes) \ - do { \ - byte scratch[WC_AES_BLOCK_SIZE]; \ - FlattenSzInBits(&scratch[0], aes->aSz); \ - FlattenSzInBits(&scratch[8], aes->cSz); \ - GHASH_ONE_BLOCK_AARCH64(aes, scratch); \ - } \ - while (0) - -/* Update the GHASH with AAD and/or cipher text. - * - * @param [in,out] aes AES GCM object. - * @param [in] a Additional authentication data buffer. - * @param [in] aSz Size of data in AAD buffer. - * @param [in] c Cipher text buffer. - * @param [in] cSz Size of data in cipher text buffer. - */ -void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, const byte* c, - word32 cSz) -{ - word32 blocks; - word32 partial; - - /* Hash in A, the Additional Authentication Data */ - if (aSz != 0 && a != NULL) { - /* Update count of AAD we have hashed. */ - aes->aSz += aSz; - /* Check if we have unprocessed data. */ - if (aes->aOver > 0) { - /* Calculate amount we can use - fill up the block. */ - byte sz = WC_AES_BLOCK_SIZE - aes->aOver; - if (sz > aSz) { - sz = aSz; - } - /* Copy extra into last GHASH block array and update count. */ - XMEMCPY(AES_LASTGBLOCK(aes) + aes->aOver, a, sz); - aes->aOver += sz; - if (aes->aOver == WC_AES_BLOCK_SIZE) { - /* We have filled up the block and can process. */ - GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); - /* Reset count. */ - aes->aOver = 0; - } - /* Used up some data. */ - aSz -= sz; - a += sz; - } - - /* Calculate number of blocks of AAD and the leftover. */ - blocks = aSz / WC_AES_BLOCK_SIZE; - partial = aSz % WC_AES_BLOCK_SIZE; - /* GHASH full blocks now. */ - while (blocks--) { - GHASH_ONE_BLOCK_AARCH64(aes, a); - a += WC_AES_BLOCK_SIZE; - } - if (partial != 0) { - /* Cache the partial block. */ - XMEMCPY(AES_LASTGBLOCK(aes), a, partial); - aes->aOver = (byte)partial; - } - } - if (aes->aOver > 0 && cSz > 0 && c != NULL) { - /* No more AAD coming and we have a partial block. */ - /* Fill the rest of the block with zeros. */ - byte sz = WC_AES_BLOCK_SIZE - aes->aOver; - XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, sz); - /* GHASH last AAD block. */ - GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); - /* Clear partial count for next time through. */ - aes->aOver = 0; - } - - /* Hash in C, the Ciphertext */ - if (cSz != 0 && c != NULL) { - /* Update count of cipher text we have hashed. */ - aes->cSz += cSz; - if (aes->cOver > 0) { - /* Calculate amount we can use - fill up the block. */ - byte sz = WC_AES_BLOCK_SIZE - aes->cOver; - if (sz > cSz) { - sz = cSz; - } - XMEMCPY(AES_LASTGBLOCK(aes) + aes->cOver, c, sz); - /* Update count of unused encrypted counter. */ - aes->cOver += sz; - if (aes->cOver == WC_AES_BLOCK_SIZE) { - /* We have filled up the block and can process. */ - GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); - /* Reset count. */ - aes->cOver = 0; - } - /* Used up some data. */ - cSz -= sz; - c += sz; - } - - /* Calculate number of blocks of cipher text and the leftover. */ - blocks = cSz / WC_AES_BLOCK_SIZE; - partial = cSz % WC_AES_BLOCK_SIZE; - /* GHASH full blocks now. */ - while (blocks--) { - GHASH_ONE_BLOCK_AARCH64(aes, c); - c += WC_AES_BLOCK_SIZE; - } - if (partial != 0) { - /* Cache the partial block. 
*/ - XMEMCPY(AES_LASTGBLOCK(aes), c, partial); - aes->cOver = (byte)partial; - } - } -} - -/* Finalize the GHASH calculation. - * - * Complete hashing cipher text and hash the AAD and cipher text lengths. - * - * @param [in, out] aes AES GCM object. - * @param [out] s Authentication tag. - * @param [in] sSz Size of authentication tag required. - */ -static void GHASH_FINAL_AARCH64(Aes* aes, byte* s, word32 sSz) -{ - /* AAD block incomplete when > 0 */ - byte over = aes->aOver; - - if (aes->cOver > 0) { - /* Cipher text block incomplete. */ - over = aes->cOver; - } - if (over > 0) { - /* Zeroize the unused part of the block. */ - XMEMSET(AES_LASTGBLOCK(aes) + over, 0, WC_AES_BLOCK_SIZE - over); - /* Hash the last block of cipher text. */ - GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); - } - /* Hash in the lengths of AAD and cipher text in bits */ - GHASH_LEN_BLOCK_AARCH64(aes); - /* Copy the result into s. */ - XMEMCPY(s, AES_TAG(aes), sSz); -} - -void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz) +static void GHASH_AARCH64_EOR3(Gcm* gcm, const byte* a, word32 aSz, + const byte* c, word32 cSz, byte* s, word32 sSz) { - ALIGN32 byte counter[WC_AES_BLOCK_SIZE]; - - if (ivSz == GCM_NONCE_MID_SZ) { - /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ - XMEMCPY(counter, iv, ivSz); - XMEMSET(counter + GCM_NONCE_MID_SZ, 0, - WC_AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); - counter[WC_AES_BLOCK_SIZE - 1] = 1; - } - else { - /* Counter is GHASH of IV. */ - #ifdef OPENSSL_EXTRA - word32 aadTemp = aes->gcm.aadLen; - aes->gcm.aadLen = 0; - #endif - GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); - GMULT_AARCH64(counter, aes->gcm.H); - #ifdef OPENSSL_EXTRA - aes->gcm.aadLen = aadTemp; - #endif - } + byte scratch[WC_AES_BLOCK_SIZE]; - /* Copy in the counter for use with cipher. */ - XMEMCPY(AES_COUNTER(aes), counter, WC_AES_BLOCK_SIZE); - /* Encrypt initial counter into a buffer for GCM. */ - AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, - (int)aes->rounds); -} + __asm__ __volatile__ ( + "LD1 {v3.16b}, %[h] \n" + "MOVI v7.16b, #0x87 \n" + "EOR v0.16b, v0.16b, v0.16b \n" + "USHR v7.2d, v7.2d, #56 \n" -void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, const byte* in, - word32 sz) -{ - word32 blocks; - word32 partial; + "# AAD \n" + "CBZ %[a], 20f \n" + "CBZ %w[aSz], 20f \n" + "MOV w12, %w[aSz] \n" - /* Check if previous encrypted block was not used up. 
*/ - if (aes->over > 0) { - byte pSz = WC_AES_BLOCK_SIZE - aes->over; + "CMP x12, #64 \n" + "BLT 15f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v11.1q, v3.2d, v3.2d \n" + "PMULL v10.1q, v3.1d, v3.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v4.16b, v10.16b, v11.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v10.1q, v4.1d, v3.1d \n" + "PMULL2 v11.1q, v4.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v4.1d, v12.1d \n" + "PMULL2 v12.1q, v4.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v5.16b, v10.16b, v12.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v11.1q, v4.2d, v4.2d \n" + "PMULL v10.1q, v4.1d, v4.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v6.16b, v10.16b, v11.16b \n" + "14: \n" + "LD1 {v10.2d-v13.2d}, [%[a]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v10.16b, v10.16b \n" + "RBIT v11.16b, v11.16b \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "EOR v10.16b, v10.16b, v0.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v0.1q, v13.1d, v3.1d \n" + "PMULL2 v1.1q, v13.2d, v3.2d \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v2.1q, v13.1d, v3.1d \n" + "PMULL2 v9.1q, v13.2d, v3.2d \n" + "EOR v2.16b, v2.16b, v9.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v8.1q, v12.1d, v4.1d \n" + "PMULL2 v9.1q, v12.2d, v4.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v9.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v8.1q, v11.1d, v5.1d \n" + "PMULL2 v9.1q, v11.2d, v5.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v11.16b, v11.16b, v11.16b, #8 \n" + "PMULL v9.1q, v11.1d, v5.1d \n" + "PMULL2 v11.1q, v11.2d, v5.2d \n" + "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v8.1q, v10.1d, v6.1d \n" + "PMULL2 v9.1q, v10.2d, v6.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v10.16b, v10.16b, v10.16b, #8 \n" + "PMULL v9.1q, v10.1d, v6.1d \n" + "PMULL2 v10.1q, v10.2d, v6.2d \n" + "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v9.16b, v0.16b, v1.16b, #8 \n" + "PMULL2 v8.1q, v1.2d, v7.2d \n" + "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" + "PMULL2 v8.1q, v9.2d, v7.2d \n" + "MOV v0.D[1], v9.D[0] \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "CMP x12, #64 \n" + "BGE 14b \n" + "CBZ x12, 20f \n" + "15: \n" + "CMP x12, #16 \n" + "BLT 12f \n" + "11: \n" + "LD1 {v14.2d}, [%[a]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d 
\n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "CMP x12, #16 \n" + "BGE 11b \n" + "CBZ x12, 120f \n" + "12: \n" + "# Partial AAD \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "MOV x14, x12 \n" + "ST1 {v14.2d}, [%[scratch]] \n" + "13: \n" + "LDRB w13, [%[a]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 13b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v14.2d}, [%[scratch]] \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + + "20: \n" + "# Cipher Text \n" + "CBZ %[c], 120f \n" + "CBZ %w[cSz], 120f \n" + "MOV w12, %w[cSz] \n" + + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v11.1q, v3.2d, v3.2d \n" + "PMULL v10.1q, v3.1d, v3.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v4.16b, v10.16b, v11.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v10.1q, v4.1d, v3.1d \n" + "PMULL2 v11.1q, v4.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v4.1d, v12.1d \n" + "PMULL2 v12.1q, v4.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v5.16b, v10.16b, v12.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v11.1q, v4.2d, v4.2d \n" + "PMULL v10.1q, v4.1d, v4.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v6.16b, v10.16b, v11.16b \n" + "114: \n" + "LD1 {v10.2d-v13.2d}, [%[c]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v10.16b, v10.16b \n" + "RBIT v11.16b, v11.16b \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "EOR v10.16b, v10.16b, v0.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v0.1q, v13.1d, v3.1d \n" + "PMULL2 v1.1q, v13.2d, v3.2d \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v2.1q, v13.1d, v3.1d \n" + "PMULL2 v9.1q, v13.2d, v3.2d \n" + "EOR v2.16b, v2.16b, v9.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v8.1q, v12.1d, v4.1d \n" + "PMULL2 v9.1q, v12.2d, v4.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v9.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v8.1q, v11.1d, v5.1d \n" + "PMULL2 v9.1q, v11.2d, v5.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v11.16b, v11.16b, v11.16b, #8 \n" + "PMULL v9.1q, v11.1d, v5.1d \n" + "PMULL2 v11.1q, v11.2d, v5.2d \n" + "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" + "# x[0-2] += C * H^4 \n" + 
"PMULL v8.1q, v10.1d, v6.1d \n" + "PMULL2 v9.1q, v10.2d, v6.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v10.16b, v10.16b, v10.16b, #8 \n" + "PMULL v9.1q, v10.1d, v6.1d \n" + "PMULL2 v10.1q, v10.2d, v6.2d \n" + "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v9.16b, v0.16b, v1.16b, #8 \n" + "PMULL2 v8.1q, v1.2d, v7.2d \n" + "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" + "PMULL2 v8.1q, v9.2d, v7.2d \n" + "MOV v0.D[1], v9.D[0] \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v14.2d}, [%[c]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial cipher text \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "MOV x14, x12 \n" + "ST1 {v14.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[c]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v14.2d}, [%[scratch]] \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "120: \n" + "RBIT v0.16b, v0.16b \n" + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[cSz], %x[cSz], #3 \n" + "MOV v10.D[0], %x[aSz] \n" + "MOV v10.D[1], %x[cSz] \n" + "REV64 v10.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v10.16b \n" + "ST1 {v0.16b}, [%[scratch]] \n" + : [cSz] "+r" (cSz), [c] "+r" (c), [aSz] "+r" (aSz), [a] "+r" (a) + : [scratch] "r" (scratch), [h] "m" (gcm->H) + : "cc", "memory", "w12", "w13", "x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); + + XMEMCPY(s, scratch, sSz); +} +#endif + +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + +#define GHASH_AARCH64(gcm, a, aSz, c, cSz, s, sSz) \ + do { \ + if (aes->use_sha3_hw_crypto) { \ + GHASH_AARCH64_EOR3(gcm, a, aSz, c, cSz, s, sSz); \ + } \ + else { \ + GHASH_AARCH64_EOR(gcm, a, aSz, c, cSz, s, sSz); \ + } \ + } \ + while (0) + +#else + + #define GHASH_AARCH64(gcm, a, aSz, c, cSz, s, sSz) \ + GHASH_AARCH64_EOR(gcm, a, aSz, c, cSz, s, sSz) + +#endif + +#ifdef WOLFSSL_AESGCM_STREAM + /* Access initialization counter data. */ + #define AES_INITCTR(aes) ((aes)->streamData + 0 * WC_AES_BLOCK_SIZE) + /* Access counter data. */ + #define AES_COUNTER(aes) ((aes)->streamData + 1 * WC_AES_BLOCK_SIZE) + /* Access tag data. */ + #define AES_TAG(aes) ((aes)->streamData + 2 * WC_AES_BLOCK_SIZE) + /* Access last GHASH block. 
*/ + #define AES_LASTGBLOCK(aes) ((aes)->streamData + 3 * WC_AES_BLOCK_SIZE) + /* Access last encrypted block. */ + #define AES_LASTBLOCK(aes) ((aes)->streamData + 4 * WC_AES_BLOCK_SIZE) + +/* GHASH one block of data. + * + * XOR block into tag and GMULT with H. + * + * @param [in, out] aes AES GCM object. + * @param [in] block Block of AAD or cipher text. + */ +#define GHASH_ONE_BLOCK_AARCH64(aes, block) \ + do { \ + xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ + GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ + } \ + while (0) + +/* Hash in the lengths of the AAD and cipher text in bits. + * + * Default implementation. + * + * @param [in, out] aes AES GCM object. + */ +#define GHASH_LEN_BLOCK_AARCH64(aes) \ + do { \ + byte scratch[WC_AES_BLOCK_SIZE]; \ + FlattenSzInBits(&scratch[0], aes->aSz); \ + FlattenSzInBits(&scratch[8], aes->cSz); \ + GHASH_ONE_BLOCK_AARCH64(aes, scratch); \ + } \ + while (0) + +/* Update the GHASH with AAD and/or cipher text. + * + * @param [in,out] aes AES GCM object. + * @param [in] a Additional authentication data buffer. + * @param [in] aSz Size of data in AAD buffer. + * @param [in] c Cipher text buffer. + * @param [in] cSz Size of data in cipher text buffer. + */ +void GHASH_UPDATE_AARCH64(Aes* aes, const byte* a, word32 aSz, const byte* c, + word32 cSz) +{ + word32 blocks; + word32 partial; + + /* Hash in A, the Additional Authentication Data */ + if (aSz != 0 && a != NULL) { + /* Update count of AAD we have hashed. */ + aes->aSz += aSz; + /* Check if we have unprocessed data. */ + if (aes->aOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = WC_AES_BLOCK_SIZE - aes->aOver; + if (sz > aSz) { + sz = aSz; + } + /* Copy extra into last GHASH block array and update count. */ + XMEMCPY(AES_LASTGBLOCK(aes) + aes->aOver, a, sz); + aes->aOver += sz; + if (aes->aOver == WC_AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->aOver = 0; + } + /* Used up some data. */ + aSz -= sz; + a += sz; + } + + /* Calculate number of blocks of AAD and the leftover. */ + blocks = aSz / WC_AES_BLOCK_SIZE; + partial = aSz % WC_AES_BLOCK_SIZE; + /* GHASH full blocks now. */ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, a); + a += WC_AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), a, partial); + aes->aOver = (byte)partial; + } + } + if (aes->aOver > 0 && cSz > 0 && c != NULL) { + /* No more AAD coming and we have a partial block. */ + /* Fill the rest of the block with zeros. */ + byte sz = WC_AES_BLOCK_SIZE - aes->aOver; + XMEMSET(AES_LASTGBLOCK(aes) + aes->aOver, 0, sz); + /* GHASH last AAD block. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Clear partial count for next time through. */ + aes->aOver = 0; + } + + /* Hash in C, the Ciphertext */ + if (cSz != 0 && c != NULL) { + /* Update count of cipher text we have hashed. */ + aes->cSz += cSz; + if (aes->cOver > 0) { + /* Calculate amount we can use - fill up the block. */ + byte sz = WC_AES_BLOCK_SIZE - aes->cOver; + if (sz > cSz) { + sz = cSz; + } + XMEMCPY(AES_LASTGBLOCK(aes) + aes->cOver, c, sz); + /* Update count of unused encrypted counter. */ + aes->cOver += sz; + if (aes->cOver == WC_AES_BLOCK_SIZE) { + /* We have filled up the block and can process. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + /* Reset count. */ + aes->cOver = 0; + } + /* Used up some data. 
*/ + cSz -= sz; + c += sz; + } + + /* Calculate number of blocks of cipher text and the leftover. */ + blocks = cSz / WC_AES_BLOCK_SIZE; + partial = cSz % WC_AES_BLOCK_SIZE; + /* GHASH full blocks now. */ + while (blocks--) { + GHASH_ONE_BLOCK_AARCH64(aes, c); + c += WC_AES_BLOCK_SIZE; + } + if (partial != 0) { + /* Cache the partial block. */ + XMEMCPY(AES_LASTGBLOCK(aes), c, partial); + aes->cOver = (byte)partial; + } + } +} + +/* Finalize the GHASH calculation. + * + * Complete hashing cipher text and hash the AAD and cipher text lengths. + * + * @param [in, out] aes AES GCM object. + * @param [out] s Authentication tag. + * @param [in] sSz Size of authentication tag required. + */ +static void GHASH_FINAL_AARCH64(Aes* aes, byte* s, word32 sSz) +{ + /* AAD block incomplete when > 0 */ + byte over = aes->aOver; + + if (aes->cOver > 0) { + /* Cipher text block incomplete. */ + over = aes->cOver; + } + if (over > 0) { + /* Zeroize the unused part of the block. */ + XMEMSET(AES_LASTGBLOCK(aes) + over, 0, WC_AES_BLOCK_SIZE - over); + /* Hash the last block of cipher text. */ + GHASH_ONE_BLOCK_AARCH64(aes, AES_LASTGBLOCK(aes)); + } + /* Hash in the lengths of AAD and cipher text in bits */ + GHASH_LEN_BLOCK_AARCH64(aes); + /* Copy the result into s. */ + XMEMCPY(s, AES_TAG(aes), sSz); +} + +void AES_GCM_init_AARCH64(Aes* aes, const byte* iv, word32 ivSz) +{ + ALIGN32 byte counter[WC_AES_BLOCK_SIZE]; + + if (ivSz == GCM_NONCE_MID_SZ) { + /* Counter is IV with bottom 4 bytes set to: 0x00,0x00,0x00,0x01. */ + XMEMCPY(counter, iv, ivSz); + XMEMSET(counter + GCM_NONCE_MID_SZ, 0, + WC_AES_BLOCK_SIZE - GCM_NONCE_MID_SZ - 1); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + /* Counter is GHASH of IV. */ + #ifdef OPENSSL_EXTRA + word32 aadTemp = aes->gcm.aadLen; + aes->gcm.aadLen = 0; + #endif + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + #ifdef OPENSSL_EXTRA + aes->gcm.aadLen = aadTemp; + #endif + } + + /* Copy in the counter for use with cipher. */ + XMEMCPY(AES_COUNTER(aes), counter, WC_AES_BLOCK_SIZE); + /* Encrypt initial counter into a buffer for GCM. */ + AES_encrypt_AARCH64(counter, AES_INITCTR(aes), (byte*)aes->key, + (int)aes->rounds); +} + +void AES_GCM_crypt_update_AARCH64(Aes* aes, byte* out, const byte* in, + word32 sz) +{ + word32 blocks; + word32 partial; + + /* Check if previous encrypted block was not used up. */ + if (aes->over > 0) { + byte pSz = WC_AES_BLOCK_SIZE - aes->over; if (pSz > sz) pSz = sz; - /* Use some/all of last encrypted block. */ - xorbufout(out, AES_LASTBLOCK(aes) + aes->over, in, pSz); - aes->over = (aes->over + pSz) & (WC_AES_BLOCK_SIZE - 1); + /* Use some/all of last encrypted block. */ + xorbufout(out, AES_LASTBLOCK(aes) + aes->over, in, pSz); + aes->over = (aes->over + pSz) & (WC_AES_BLOCK_SIZE - 1); + + /* Some data used. */ + sz -= pSz; + in += pSz; + out += pSz; + } + + /* Calculate the number of blocks needing to be encrypted and any leftover. + */ + blocks = sz / WC_AES_BLOCK_SIZE; + partial = sz & (WC_AES_BLOCK_SIZE - 1); + + /* Encrypt block by block. */ + while (blocks--) { + ALIGN32 byte scratch[WC_AES_BLOCK_SIZE]; + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into a buffer. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, + (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, scratch, in, WC_AES_BLOCK_SIZE); + /* Data complete. 
*/ + in += WC_AES_BLOCK_SIZE; + out += WC_AES_BLOCK_SIZE; + } + + if (partial != 0) { + /* Generate an extra block and use up as much as needed. */ + IncrementGcmCounter(AES_COUNTER(aes)); + /* Encrypt counter into cache. */ + AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), + (byte*)aes->key, (int)aes->rounds); + /* XOR plain text into encrypted counter into cipher text buffer. */ + xorbufout(out, AES_LASTBLOCK(aes), in, partial); + /* Keep amount of encrypted block used. */ + aes->over = partial; + } +} + +/* Calculates authentication tag for AES GCM. C implementation. + * + * @param [in, out] aes AES object. + * @param [out] authTag Buffer to store authentication tag in. + * @param [in] authTagSz Length of tag to create. + */ +void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, word32 authTagSz) +{ + /* Calculate authentication tag. */ + GHASH_FINAL_AARCH64(aes, authTag, authTagSz); + /* XOR in as much of encrypted counter as is required. */ + xorbuf(authTag, AES_INITCTR(aes), authTagSz); +#ifdef OPENSSL_EXTRA + /* store AAD size for next call */ + aes->gcm.aadLen = aes->aSz; +#endif + /* Zeroize last block to protect sensitive data. */ + ForceZero(AES_LASTBLOCK(aes), WC_AES_BLOCK_SIZE); +} +#endif /* WOLFSSL_AESGCM_STREAM */ + +#ifdef WOLFSSL_AES_128 +/* internal function : see AES_GCM_encrypt_AARCH64 */ +static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Noticed different optimization levels treated head of array different. + * Some cases was stack pointer plus offset others was a register containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. + */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL 
v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? 
\n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" 
+ "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE 
v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + 
"AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + 
"BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + 
"RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, 
v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b 
\n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" 
+ "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifdef WOLFSSL_AES_128 +/* internal function : see AES_GCM_encrypt_AARCH64 */ +static void Aes128GcmEncrypt_EOR3(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 
ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Different optimization levels treated the head of the arrays differently: + * in some cases it was the stack pointer plus an offset, in others a register + * containing the address. To make the operands uniform when passing them into + * the inline assembly code, pointers to the head of each local array are used. + */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d
\n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, 
v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" 
+ "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, 
v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC 
v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 
v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, 
v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + 
"RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 
{v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, 
v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, 
x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} +#endif /* WOLFSSL_AES_128 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifdef WOLFSSL_AES_192 +/* internal function : see AES_GCM_encrypt_AARCH64 */ +static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Noticed different optimization levels treated 
head of array different. + * Some cases was stack pointer plus offset others was a register containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. + */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, 
#8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# 
Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE 
v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, 
v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV 
v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" 
+ "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE 
v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v3.16b 
\n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + 
"EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, 
v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, 
v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, 
v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b 
\n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifdef WOLFSSL_AES_192 +/* internal function : see AES_GCM_encrypt_AARCH64 */ +static void Aes192GcmEncrypt_EOR3(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Noticed different optimization levels treated head of array different. + * Some cases was stack pointer plus offset others was a register containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. + */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL 
v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? 
\n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, 
v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV 
v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v31.16b, v31.16b, 
v13.16b, v3.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, 
v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d 
\n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + 
"AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, 
v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b 
\n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + 
"AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter 
value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} +#endif /* WOLFSSL_AES_192 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifdef WOLFSSL_AES_256 +/* internal function : see AES_GCM_encrypt_AARCH64 */ +static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Noticed different optimization levels treated head of array different. + * Some cases was stack pointer plus offset others was a register containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. 
+ */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + 
"PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? 
\n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, 
v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, 
v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d 
\n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + 
"AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, 
v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR v3.16b, v3.16b, v31.16b \n" + "EOR v3.16b, v3.16b, v2.16b \n" + "PMULL2 
v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + 
"AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC 
v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, 
v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR v15.16b, v15.16b, v31.16b \n" + "EOR v15.16b, v15.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "SUB %[Key], %[Key], #32 \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + 
"PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: 
\n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); +} +#endif /* WOLFSSL_AES_256 */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifdef WOLFSSL_AES_256 +/* internal function : see 
AES_GCM_encrypt_AARCH64 */ +static void Aes256GcmEncrypt_EOR3(Aes* aes, byte* out, const byte* in, + word32 sz, const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[WC_AES_BLOCK_SIZE]; + byte scratch[WC_AES_BLOCK_SIZE]; + /* Noticed different optimization levels treated head of array different. + * Some cases was stack pointer plus offset others was a register containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. + */ + byte* ctr = counter; + byte* keyPt = (byte*)aes->key; + + XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[WC_AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH_AARCH64(&aes->gcm, NULL, 0, iv, ivSz, counter, WC_AES_BLOCK_SIZE); + GMULT_AARCH64(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 
v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 
v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE 
v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, 
v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT 
v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" 
+ "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], 
#128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, 
v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 
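/*
 * The trailing "# GHASH - 8 blocks" section above folds the last eight
 * ciphertext blocks into the running tag with a single reduction: the
 * first block is XORed with the current tag and weighted by H^8, the
 * following blocks by H^7 down to H^1.  A scalar sketch of that
 * aggregation (one full multiply per block instead of one deferred
 * reduction, so it shows the algebra rather than the performance trick):
 */
/* tag = (tag ^ C[0])*H^8 ^ C[1]*H^7 ^ ... ^ C[7]*H^1 in GF(2^128). */
static void ghash_8_blocks(uint8_t tag[16], const uint8_t c[8][16],
                           const uint8_t Hpow[8][16])
{
    uint8_t acc[16] = {0};
    uint8_t blk[16];
    uint8_t t[16];
    int i, k;

    for (i = 0; i < 8; i++) {
        memcpy(blk, c[i], 16);
        if (i == 0) {
            for (k = 0; k < 16; k++)
                blk[k] ^= tag[k];                  /* fold in running tag */
        }
        gf128_mul(blk, Hpow[7 - i], t);            /* weight by H^(8-i)   */
        for (k = 0; k < 16; k++)
            acc[k] ^= t[k];
    }
    memcpy(tag, acc, 16);
}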
\n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b 
\n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR 
v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, 
v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "SUB %[Key], %[Key], #32 \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" - /* Some data used. */ - sz -= pSz; - in += pSz; - out += pSz; - } + "LD1 {v31.2d}, [%[input]], #16 \n" - /* Calculate the number of blocks needing to be encrypted and any leftover. 
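/*
 * The single-block path above keeps the 32-bit block counter in w12,
 * bumps it with ADD and writes it back big-endian into the last word of
 * the counter block (REV followed by "MOV v0.S[3]").  A portable sketch
 * of that increment is shown below; only the final four bytes change,
 * matching GCM's 32-bit counter, and gcm_inc32 is an illustrative name.
 */
/* Increment the 32-bit big-endian counter in the last word of the block. */
static void gcm_inc32(uint8_t ctr[16])
{
    int i;

    for (i = 15; i >= 12; i--) {
        if (++ctr[i] != 0)
            break;                                 /* stop once no wrap */
    }
}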
- */ - blocks = sz / WC_AES_BLOCK_SIZE; - partial = sz & (WC_AES_BLOCK_SIZE - 1); + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" - /* Encrypt block by block. */ - while (blocks--) { - ALIGN32 byte scratch[WC_AES_BLOCK_SIZE]; - IncrementGcmCounter(AES_COUNTER(aes)); - /* Encrypt counter into a buffer. */ - AES_encrypt_AARCH64(AES_COUNTER(aes), scratch, (byte*)aes->key, - (int)aes->rounds); - /* XOR plain text into encrypted counter into cipher text buffer. */ - xorbufout(out, scratch, in, WC_AES_BLOCK_SIZE); - /* Data complete. 
*/ - in += WC_AES_BLOCK_SIZE; - out += WC_AES_BLOCK_SIZE; - } + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b 
\n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" - if (partial != 0) { - /* Generate an extra block and use up as much as needed. */ - IncrementGcmCounter(AES_COUNTER(aes)); - /* Encrypt counter into cache. */ - AES_encrypt_AARCH64(AES_COUNTER(aes), AES_LASTBLOCK(aes), - (byte*)aes->key, (int)aes->rounds); - /* XOR plain text into encrypted counter into cipher text buffer. */ - xorbufout(out, AES_LASTBLOCK(aes), in, partial); - /* Keep amount of encrypted block used. */ - aes->over = partial; - } + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); } +#endif /* WOLFSSL_AES_256 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -/* Calculates authentication tag for AES GCM. C implementation. +/* aarch64 with PMULL and PMULL2 + * Encrypt and tag data using AES with GCM mode. + * aes: Aes structure having already been set with set key function + * out: encrypted data output buffer + * in: plain text input buffer + * sz: size of plain text and out buffer + * iv: initialization vector + * ivSz: size of iv buffer + * authTag: buffer to hold tag + * authTagSz: size of tag buffer + * authIn: additional data buffer + * authInSz: size of additional data buffer * - * @param [in, out] aes AES object. - * @param [out] authTag Buffer to store authentication tag in. - * @param [in] authTagSz Length of tag to create. + * Notes: + * GHASH multiplication based from Algorithm 1 from Intel GCM white paper. 
+ * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" + * + * GHASH reduction Based from White Paper "Implementing GCM on ARMv8" + * by Conrado P.L. Gouvea and Julio Lopez reduction on 256bit value using + * Algorithm 5 */ -void AES_GCM_final_AARCH64(Aes* aes, byte* authTag, word32 authTagSz) +void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { - /* Calculate authentication tag. */ - GHASH_FINAL_AARCH64(aes, authTag, authTagSz); - /* XOR in as much of encrypted counter as is required. */ - xorbuf(authTag, AES_INITCTR(aes), authTagSz); -#ifdef OPENSSL_EXTRA - /* store AAD size for next call */ - aes->gcm.aadLen = aes->aSz; + switch (aes->rounds) { +#ifdef WOLFSSL_AES_128 + case 10: + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + Aes128GcmEncrypt_EOR3(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } + else + #endif + { + Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + } + break; #endif - /* Zeroize last block to protect sensitive data. */ - ForceZero(AES_LASTBLOCK(aes), WC_AES_BLOCK_SIZE); +#ifdef WOLFSSL_AES_192 + case 12: + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + Aes192GcmEncrypt_EOR3(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } + else + #endif + { + Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + } + break; +#endif +#ifdef WOLFSSL_AES_256 + case 14: + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + Aes256GcmEncrypt_EOR3(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } + else + #endif + { + Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, + authIn, authInSz); + } + break; +#endif + } } -#endif /* WOLFSSL_AESGCM_STREAM */ +#ifdef HAVE_AES_DECRYPT #ifdef WOLFSSL_AES_128 -/* internal function : see wc_AesGcmEncrypt */ -static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { byte counter[WC_AES_BLOCK_SIZE]; byte scratch[WC_AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a register containing - * address. To make uniform for passing in to inline assembly code am using - * pointers to the head of each local array. 
- */ - byte* ctr = counter; + byte *ctr = counter; byte* keyPt = (byte*)aes->key; + int ret = 0; XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { @@ -2286,12 +12448,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -2300,12 +12458,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -2314,23 +12468,13 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -2392,7 +12536,7 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v17.16b, v18.16b, v20.16b \n" "120: \n" - "# Encrypt plaintext and GHASH ciphertext \n" + "# Decrypt ciphertext and GHASH ciphertext \n" "LDR w12, [%[ctr], #12] \n" "MOV w11, %w[sz] \n" "REV w12, w12 \n" @@ -2455,7 +12599,7 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v9.16b, v18.16b, v20.16b \n" + "EOR v4.16b, v18.16b, v20.16b \n" "# Square H^3 - H^6 \n" "PMULL2 v19.1q, v25.2d, v25.2d \n" "PMULL v18.1q, v25.1d, v25.1d \n" @@ -2464,13 +12608,13 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v10.16b, v18.16b, v19.16b \n" + "EOR v9.16b, v18.16b, v19.16b \n" "# Multiply H and H^6 => H^7 \n" - "PMULL v18.1q, v10.1d, v16.1d \n" - "PMULL2 v19.1q, v10.2d, v16.2d \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v10.1d, v20.1d \n" - "PMULL2 v20.1q, v10.2d, v20.2d \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" "EOR v20.16b, v20.16b, v21.16b \n" "EXT v21.16b, v18.16b, v19.16b, #8 \n" "EOR v21.16b, v21.16b, v20.16b \n" @@ -2479,7 +12623,7 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const 
byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v11.16b, v18.16b, v20.16b \n" + "EOR v10.16b, v18.16b, v20.16b \n" "# Square H^4 => H^8 \n" "PMULL2 v19.1q, v26.2d, v26.2d \n" "PMULL v18.1q, v26.1d, v26.1d \n" @@ -2488,9 +12632,9 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v4.16b, v18.16b, v19.16b \n" + "EOR v11.16b, v18.16b, v19.16b \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "LDR q1, [%[Key]] \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" @@ -2702,16 +12846,16 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input \n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "81: \n" "LDR q1, [%[Key]] \n" @@ -2783,12 +12927,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -2807,12 +12947,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -2832,17 +12968,13 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "AESE v29.16b, v22.16b \n" "AESMC v29.16b, v29.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -2853,21 +12985,17 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #48] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" + "PMULL v3.1q, 
v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -2877,21 +13005,17 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, v10.2d \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "LDR q1, [%[Key], #64] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -2902,21 +13026,17 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "AESE v7.16b, v22.16b \n" "AESMC v7.16b, v7.16b \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -2925,18 +13045,14 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "EXT v12.16b, v12.16b, v12.16b, #8 \n" "AESE v30.16b, v22.16b \n" - "AESMC v30.16b, v30.16b \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" "# Reduce X = x[0-2] \n" @@ -2946,16 +13062,10 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" -#ifndef 
WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" "PMULL2 v2.1q, v3.2d, v23.2d \n" @@ -3054,16 +13164,16 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input \n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #128 \n" "BGE 81b \n" @@ -3093,12 +13203,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -3107,12 +13213,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -3121,79 +13223,53 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v15.16b, v15.16b, v15.16b, #8 \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v14.16b, v14.16b, v14.16b, #8 \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, 
v3.16b \n" -#else + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v12.16b, v12.16b, v12.16b, #8 \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -3207,7 +13283,7 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CMP w11, #64 \n" "BLT 10f \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" "MOV v27.16b, v22.16b \n" @@ -3312,12 +13388,12 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v11.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BLT 12f \n" @@ -3382,12 +13458,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -3406,12 +13478,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR 
v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -3430,12 +13498,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -3446,16 +13510,10 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ -3495,12 +13553,12 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v11.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BGE 11b \n" @@ -3526,12 +13584,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -3540,12 +13594,8 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -3554,23 +13604,13 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, 
v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -3579,7 +13619,7 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CBZ w11, 30f \n" "CMP w11, #16 \n" "BLT 20f \n" - "# Encrypt first block for GHASH \n" + "# Decrypt first block for GHASH \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" @@ -3601,28 +13641,26 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v31.2d}, [%[input]], #16 \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" "EOR v0.16b, v0.16b, v11.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[out]], #16 \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" - "# When only one full block to encrypt go straight to GHASH \n" + "# When only one full block to decrypt go straight to GHASH \n" "CMP w11, 16 \n" "BLT 1f \n" - "LD1 {v31.2d}, [%[input]], #16 \n" - - "# Interweave GHASH and encrypt if more then 1 block \n" + "# Interweave GHASH and decrypt if more then 1 block \n" "2: \n" - "RBIT v15.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" @@ -3652,24 +13690,22 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" "EOR v0.16b, v0.16b, v11.16b \n \n" "EOR v17.16b, v18.16b, v20.16b \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[out]], #16 \n" - "CMP w11, 16 \n" - "BLT 1f \n" - - "LD1 {v31.2d}, [%[input]], #16 \n" - "B 2b \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" "# GHASH on last block \n" "1: \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -3697,78 +13733,71 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CBNZ x15, 23b \n" "SUB %[scratch], %[scratch], x11 \n" "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" "AESE v0.16b, v2.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" "AESE v0.16b, v3.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" "AESE v0.16b, v4.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" "AESE v0.16b, v5.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v6.16b \n" "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" "AESE 
v0.16b, v7.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" "EOR v0.16b, v0.16b, v11.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[scratch]] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" "MOV x15, x11 \n" "24: \n" "LDRB w14, [%[scratch]], #1 \n" "STRB w14, [%[out]], #1 \n" "SUB x15, x15, #1 \n" "CBNZ x15, 24b \n" - "MOV x15, #16 \n" - "EOR w14, w14, w14 \n" - "SUB x15, x15, x11 \n" - "25: \n" - "STRB w14, [%[scratch]], #1 \n" - "SUB x15, x15, #1 \n" - "CBNZ x15, 25b \n" - "SUB %[scratch], %[scratch], #16 \n" - "LD1 {v15.2d}, [%[scratch]] \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" - "PMULL v18.1q, v17.1d, v16.1d \n" - "PMULL2 v19.1q, v17.2d, v16.2d \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v17.1d, v20.1d \n" - "PMULL2 v20.1q, v17.2d, v20.2d \n" - "EOR v20.16b, v20.16b, v21.16b \n" - "EXT v21.16b, v18.16b, v19.16b, #8 \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "# Reduce \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "PMULL2 v20.1q, v21.2d, v23.2d \n" - "MOV v18.D[1], v21.D[0] \n" - "EOR v17.16b, v18.16b, v20.16b \n" + "SUB %[scratch], %[scratch], x11 \n" "30: \n" "# store current counter value at the end \n" "REV w13, w12 \n" "MOV v22.S[3], w13 \n" - "LD1 {v0.2d}, [%[ctr]] \n" - "ST1 {v22.2d}, [%[ctr]] \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" "LSL %x[aSz], %x[aSz], #3 \n" "LSL %x[sz], %x[sz], #3 \n" - "MOV v15.d[0], %x[aSz] \n" - "MOV v15.d[1], %x[sz] \n" - "REV64 v15.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -3807,21 +13836,42 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v0.16b, v0.16b, v17.16b \n \n" "CMP %w[tagSz], #16 \n" "BNE 40f \n" - "ST1 {v0.2d}, [%[tag]] \n" + "LD1 {v1.2d}, [%[tag]] \n" "B 41f \n" "40: \n" - "ST1 {v0.2d}, [%[scratch]] \n" - "MOV x15, %x[tagSz] \n" - "44: \n" - "LDRB w14, [%[scratch]], #1 \n" - "STRB w14, [%[tag]], #1 \n" + "EOR v1.16b, v1.16b, v1.16b \n" + "MOV x15, %x[tagSz] \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" "SUB x15, x15, #1 \n" - "CBNZ x15, 44b \n" + "CBNZ x15, 43b \n" "SUB %[scratch], %[scratch], %x[tagSz] \n" + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" + "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), - [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + [aSz] "+r" (authInSz), [sz] "+r" (sz), 
[aad] "+r" (authIn), + [ret] "+r" (ret) : [ctr] "r" (ctr), [scratch] "r" (scratch), [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", @@ -3830,23 +13880,22 @@ static void Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); + + return ret; } #endif /* WOLFSSL_AES_128 */ -#ifdef WOLFSSL_AES_192 -/* internal function : see wc_AesGcmEncrypt */ -static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifdef WOLFSSL_AES_128 +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes128GcmDecrypt_EOR3(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { byte counter[WC_AES_BLOCK_SIZE]; byte scratch[WC_AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a register containing - * address. To make uniform for passing in to inline assembly code am using - * pointers to the head of each local array. - */ - byte* ctr = counter; + byte *ctr = counter; byte* keyPt = (byte*)aes->key; + int ret = 0; XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { @@ -3929,12 +13978,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -3943,12 +13987,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -3957,23 +13996,11 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -4035,7 +14062,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v17.16b, v18.16b, v20.16b \n" "120: \n" - "# 
Encrypt plaintext and GHASH ciphertext \n" + "# Decrypt ciphertext and GHASH ciphertext \n" "LDR w12, [%[ctr], #12] \n" "MOV w11, %w[sz] \n" "REV w12, w12 \n" @@ -4098,7 +14125,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v9.16b, v18.16b, v20.16b \n" + "EOR v4.16b, v18.16b, v20.16b \n" "# Square H^3 - H^6 \n" "PMULL2 v19.1q, v25.2d, v25.2d \n" "PMULL v18.1q, v25.1d, v25.1d \n" @@ -4107,13 +14134,13 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v10.16b, v18.16b, v19.16b \n" + "EOR v9.16b, v18.16b, v19.16b \n" "# Multiply H and H^6 => H^7 \n" - "PMULL v18.1q, v10.1d, v16.1d \n" - "PMULL2 v19.1q, v10.2d, v16.2d \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v10.1d, v20.1d \n" - "PMULL2 v20.1q, v10.2d, v20.2d \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" "EOR v20.16b, v20.16b, v21.16b \n" "EXT v21.16b, v18.16b, v19.16b, #8 \n" "EOR v21.16b, v21.16b, v20.16b \n" @@ -4122,7 +14149,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v11.16b, v18.16b, v20.16b \n" + "EOR v10.16b, v18.16b, v20.16b \n" "# Square H^4 => H^8 \n" "PMULL2 v19.1q, v26.2d, v26.2d \n" "PMULL v18.1q, v26.1d, v26.1d \n" @@ -4131,9 +14158,9 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v4.16b, v18.16b, v19.16b \n" + "EOR v11.16b, v18.16b, v19.16b \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "LDR q1, [%[Key]] \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" @@ -4308,42 +14335,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q22, [%[Key], #144] \n" - "AESE v5.16b, v1.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v1.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v1.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v1.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v1.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v1.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v1.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v1.16b \n" - "AESMC v30.16b, v30.16b \n" - "LDR q1, [%[Key], #160] \n" - "AESE v5.16b, v22.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v22.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v22.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v22.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v22.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v22.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v22.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v22.16b \n" - "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #176] \n" + "LDP q22, q31, [%[Key], #144] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -4379,16 +14372,16 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input 
\n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "81: \n" "LDR q1, [%[Key]] \n" @@ -4460,12 +14453,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -4484,12 +14472,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -4509,17 +14492,12 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "AESE v29.16b, v22.16b \n" "AESMC v29.16b, v29.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -4530,21 +14508,16 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #48] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -4554,21 +14527,16 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, 
v10.2d \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "LDR q1, [%[Key], #64] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -4579,21 +14547,16 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "AESE v7.16b, v22.16b \n" "AESMC v7.16b, v7.16b \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -4603,81 +14566,35 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" - "LDR q22, [%[Key], #80] \n" - "AESE v5.16b, v1.16b \n" - "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ - "AESE v6.16b, v1.16b \n" - "AESMC v6.16b, v6.16b \n" - "# Reduce X = x[0-2] \n" - "EXT v3.16b, v17.16b, v0.16b, #8 \n" - "AESE v7.16b, v1.16b \n" - "AESMC v7.16b, v7.16b \n" - "PMULL2 v2.1q, v0.2d, v23.2d \n" - "AESE v8.16b, v1.16b \n" - "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ - "AESE v27.16b, v1.16b \n" - "AESMC v27.16b, v27.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ - "AESE v28.16b, v1.16b \n" - "AESMC v28.16b, v28.16b \n" - "PMULL2 v2.1q, v3.2d, v23.2d \n" - "MOV v17.D[1], v3.D[0] \n" - "AESE v29.16b, v1.16b \n" - "AESMC v29.16b, v29.16b \n" - "EOR v17.16b, v17.16b, v2.16b \n" - "AESE v30.16b, v1.16b \n" - "AESMC v30.16b, v30.16b \n" - "SUB w11, w11, #128 \n" - "LDR q1, [%[Key], #96] \n" - "AESE v5.16b, v22.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v22.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v22.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v22.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v22.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v22.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v22.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v22.16b \n" - "AESMC v30.16b, v30.16b \n" - "LDR q22, [%[Key], #112] \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 
v12.1q, v12.2d, v11.2d \n" + "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q1, [%[Key], #128] \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v22.16b \n" @@ -4694,7 +14611,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q22, [%[Key], #144] \n" + "LDR q22, [%[Key], #112] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -4711,7 +14628,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q1, [%[Key], #160] \n" + "LDR q1, [%[Key], #128] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v22.16b \n" @@ -4729,7 +14646,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #176] \n" + "LDP q22, q31, [%[Key], #144] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -4765,16 +14682,16 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input \n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #128 \n" "BGE 81b \n" @@ -4804,12 +14721,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -4818,12 +14730,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, 
v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -4832,79 +14739,47 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v15.16b, v15.16b, v15.16b, #8 \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v14.16b, v14.16b, v14.16b, #8 \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v12.16b, v12.16b, v12.16b, #8 \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, 
v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -4914,12 +14789,11 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" - "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" "# Can we do 4 blocks at a time? \n" "CMP w11, #64 \n" "BLT 10f \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" "MOV v27.16b, v22.16b \n" @@ -4996,6 +14870,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v7.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v8.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v8.16b \n" @@ -5012,40 +14888,22 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" - "AESMC v27.16b, v27.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" "AESE v28.16b, v10.16b \n" - "AESMC v28.16b, v28.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" "AESE v29.16b, v10.16b \n" - "AESMC v29.16b, v29.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" "AESE v30.16b, v10.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v11.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v11.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v11.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v11.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v12.16b \n" - "EOR v27.16b, v27.16b, v13.16b \n" - "AESE v28.16b, v12.16b \n" - "EOR v28.16b, v28.16b, v13.16b \n" - "AESE v29.16b, v12.16b \n" - "EOR v29.16b, v29.16b, v13.16b \n" - "AESE v30.16b, v12.16b \n" - "EOR v30.16b, v30.16b, v13.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BLT 12f \n" @@ -5110,12 +14968,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -5134,12 +14987,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -5158,12 +15006,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, 
v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -5174,16 +15017,9 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ -5195,6 +15031,8 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v7.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v8.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v8.16b \n" @@ -5211,40 +15049,22 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" - "AESMC v27.16b, v27.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" "AESE v28.16b, v10.16b \n" - "AESMC v28.16b, v28.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" "AESE v29.16b, v10.16b \n" - "AESMC v29.16b, v29.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" "AESE v30.16b, v10.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v11.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v11.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v11.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v11.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v12.16b \n" - "EOR v27.16b, v27.16b, v13.16b \n" - "AESE v28.16b, v12.16b \n" - "EOR v28.16b, v28.16b, v13.16b \n" - "AESE v29.16b, v12.16b \n" - "EOR v29.16b, v29.16b, v13.16b \n" - "AESE v30.16b, v12.16b \n" - "EOR v30.16b, v30.16b, v13.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BGE 11b \n" @@ -5270,12 +15090,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -5284,12 +15099,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" 
"PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -5298,23 +15108,11 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -5323,7 +15121,7 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CBZ w11, 30f \n" "CMP w11, #16 \n" "BLT 20f \n" - "# Encrypt first block for GHASH \n" + "# Decrypt first block for GHASH \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" @@ -5345,32 +15143,26 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v31.2d}, [%[input]], #16 \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[out]], #16 \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" - "# When only one full block to encrypt go straight to GHASH \n" + "# When only one full block to decrypt go straight to GHASH \n" "CMP w11, 16 \n" "BLT 1f \n" - "LD1 {v31.2d}, [%[input]], #16 \n" - - "# Interweave GHASH and encrypt if more then 1 block \n" + "# Interweave GHASH and decrypt if more then 1 block \n" "2: \n" - "RBIT v15.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" @@ -5400,28 +15192,22 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" "EOR v17.16b, v18.16b, v20.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[out]], #16 \n" - "CMP w11, 16 \n" - "BLT 1f \n" - - "LD1 {v31.2d}, [%[input]], #16 \n" - "B 2b \n" + "EOR v0.16b, v0.16b, v28.16b 
\n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" "# GHASH on last block \n" "1: \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -5449,82 +15235,71 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CBNZ x15, 23b \n" "SUB %[scratch], %[scratch], x11 \n" "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" "AESE v0.16b, v2.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" "AESE v0.16b, v3.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" "AESE v0.16b, v4.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" "AESE v0.16b, v5.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v6.16b \n" "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" "AESE v0.16b, v7.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[scratch]] \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" "MOV x15, x11 \n" "24: \n" "LDRB w14, [%[scratch]], #1 \n" "STRB w14, [%[out]], #1 \n" "SUB x15, x15, #1 \n" "CBNZ x15, 24b \n" - "MOV x15, #16 \n" - "EOR w14, w14, w14 \n" - "SUB x15, x15, x11 \n" - "25: \n" - "STRB w14, [%[scratch]], #1 \n" - "SUB x15, x15, #1 \n" - "CBNZ x15, 25b \n" - "SUB %[scratch], %[scratch], #16 \n" - "LD1 {v15.2d}, [%[scratch]] \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" - "PMULL v18.1q, v17.1d, v16.1d \n" - "PMULL2 v19.1q, v17.2d, v16.2d \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v17.1d, v20.1d \n" - "PMULL2 v20.1q, v17.2d, v20.2d \n" - "EOR v20.16b, v20.16b, v21.16b \n" - "EXT v21.16b, v18.16b, v19.16b, #8 \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "# Reduce \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "PMULL2 v20.1q, v21.2d, v23.2d \n" - "MOV v18.D[1], v21.D[0] \n" - "EOR v17.16b, v18.16b, v20.16b \n" + "SUB %[scratch], %[scratch], x11 \n" "30: \n" "# store current counter value at the end \n" "REV w13, w12 \n" "MOV v22.S[3], w13 \n" - "LD1 {v0.2d}, [%[ctr]] \n" - "ST1 {v22.2d}, [%[ctr]] \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" "LSL %x[aSz], %x[aSz], #3 \n" "LSL %x[sz], %x[sz], #3 \n" - "MOV v15.d[0], %x[aSz] \n" - "MOV v15.d[1], %x[sz] \n" - "REV64 v15.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR 
v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -5558,30 +15333,47 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "EOR v17.16b, v18.16b, v20.16b \n" "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" "RBIT v17.16b, v17.16b \n" "EOR v0.16b, v0.16b, v17.16b \n \n" "CMP %w[tagSz], #16 \n" "BNE 40f \n" - "ST1 {v0.2d}, [%[tag]] \n" + "LD1 {v1.2d}, [%[tag]] \n" "B 41f \n" "40: \n" - "ST1 {v0.2d}, [%[scratch]] \n" + "EOR v1.16b, v1.16b, v1.16b \n" "MOV x15, %x[tagSz] \n" - "44: \n" - "LDRB w14, [%[scratch]], #1 \n" - "STRB w14, [%[tag]], #1 \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" "SUB x15, x15, #1 \n" - "CBNZ x15, 44b \n" + "CBNZ x15, 43b \n" "SUB %[scratch], %[scratch], %x[tagSz] \n" + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" + "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), - [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn), + [ret] "+r" (ret) : [ctr] "r" (ctr), [scratch] "r" (scratch), [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", @@ -5590,23 +15382,22 @@ static void Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); + + return ret; } -#endif /* WOLFSSL_AES_192 */ -#ifdef WOLFSSL_AES_256 -/* internal function : see wc_AesGcmEncrypt */ -static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, +#endif /* WOLFSSL_AES_128 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifdef WOLFSSL_AES_192 +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { byte counter[WC_AES_BLOCK_SIZE]; byte scratch[WC_AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - * Some cases was stack pointer plus offset others was a register containing - * address. To make uniform for passing in to inline assembly code am using - * pointers to the head of each local array. 
- */ - byte* ctr = counter; + byte *ctr = counter; byte* keyPt = (byte*)aes->key; + int ret = 0; XMEMSET(counter, 0, WC_AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { @@ -5689,12 +15480,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -5703,12 +15490,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -5717,23 +15500,13 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -5795,7 +15568,7 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v17.16b, v18.16b, v20.16b \n" "120: \n" - "# Encrypt plaintext and GHASH ciphertext \n" + "# Decrypt ciphertext and GHASH ciphertext \n" "LDR w12, [%[ctr], #12] \n" "MOV w11, %w[sz] \n" "REV w12, w12 \n" @@ -5858,7 +15631,7 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v9.16b, v18.16b, v20.16b \n" + "EOR v4.16b, v18.16b, v20.16b \n" "# Square H^3 - H^6 \n" "PMULL2 v19.1q, v25.2d, v25.2d \n" "PMULL v18.1q, v25.1d, v25.1d \n" @@ -5867,13 +15640,13 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v10.16b, v18.16b, v19.16b \n" + "EOR v9.16b, v18.16b, v19.16b \n" "# Multiply H and H^6 => H^7 \n" - "PMULL v18.1q, v10.1d, v16.1d \n" - "PMULL2 v19.1q, v10.2d, v16.2d \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v10.1d, v20.1d \n" - "PMULL2 v20.1q, v10.2d, v20.2d \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" "EOR v20.16b, v20.16b, v21.16b \n" "EXT v21.16b, v18.16b, v19.16b, #8 \n" "EOR v21.16b, v21.16b, v20.16b \n" @@ -5882,7 +15655,7 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const 
byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v11.16b, v18.16b, v20.16b \n" + "EOR v10.16b, v18.16b, v20.16b \n" "# Square H^4 => H^8 \n" "PMULL2 v19.1q, v26.2d, v26.2d \n" "PMULL v18.1q, v26.1d, v26.1d \n" @@ -5891,9 +15664,9 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v21.16b, v21.16b, v20.16b \n" "PMULL2 v19.1q, v21.2d, v23.2d \n" "MOV v18.D[1], v21.D[0] \n" - "EOR v4.16b, v18.16b, v19.16b \n" + "EOR v11.16b, v18.16b, v19.16b \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "LDR q1, [%[Key]] \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" @@ -6102,42 +15875,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q22, [%[Key], #176] \n" - "AESE v5.16b, v1.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v1.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v1.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v1.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v1.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v1.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v1.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v1.16b \n" - "AESMC v30.16b, v30.16b \n" - "LDR q1, [%[Key], #192] \n" - "AESE v5.16b, v22.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v22.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v22.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v22.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v22.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v22.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v22.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v22.16b \n" - "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #208] \n" + "LDP q22, q31, [%[Key], #176] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -6173,16 +15912,16 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input \n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "81: \n" "LDR q1, [%[Key]] \n" @@ -6254,12 +15993,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -6278,12 +16013,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, 
word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -6301,19 +16032,15 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v8.16b, v8.16b \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" - "AESE v27.16b, v22.16b \n" - "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "AESE v29.16b, v22.16b \n" "AESMC v29.16b, v29.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -6324,21 +16051,17 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #48] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -6348,21 +16071,17 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, v10.2d \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "LDR q1, [%[Key], #64] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -6373,21 +16092,17 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "AESE v7.16b, v22.16b \n" "AESMC v7.16b, v7.16b \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE 
v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -6397,17 +16112,13 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" "# Reduce X = x[0-2] \n" @@ -6417,16 +16128,10 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" "PMULL2 v2.1q, v3.2d, v23.2d \n" @@ -6522,42 +16227,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" - "LDR q22, [%[Key], #176] \n" - "AESE v5.16b, v1.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v1.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v1.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v1.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v1.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v1.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v1.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v1.16b \n" - "AESMC v30.16b, v30.16b \n" - "LDR q1, [%[Key], #192] \n" - "AESE v5.16b, v22.16b \n" - "AESMC v5.16b, v5.16b \n" - "AESE v6.16b, v22.16b \n" - "AESMC v6.16b, v6.16b \n" - "AESE v7.16b, v22.16b \n" - "AESMC v7.16b, v7.16b \n" - "AESE v8.16b, v22.16b \n" - "AESMC v8.16b, v8.16b \n" - "AESE v27.16b, v22.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v22.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v22.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v22.16b \n" - "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #208] \n" + "LDP q22, q31, [%[Key], #176] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -6593,16 +16264,16 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v30.16b, v30.16b, v31.16b \n" "# XOR in input \n" - "EOR v12.16b, v12.16b, v5.16b \n" - "EOR v13.16b, v13.16b, v6.16b \n" - "EOR v14.16b, v14.16b, v7.16b \n" - "EOR v15.16b, v15.16b, v8.16b \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR 
v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #128 \n" "BGE 81b \n" @@ -6632,12 +16303,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -6646,12 +16313,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -6660,79 +16323,53 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" - "PMULL v2.1q, v15.1d, v9.1d \n" - "PMULL2 v3.1q, v15.2d, v9.2d \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v15.16b, v15.16b, v15.16b, #8 \n" - "PMULL v3.1q, v15.1d, v9.1d \n" - "PMULL2 v15.1q, v15.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" - "PMULL v2.1q, v14.1d, v10.1d \n" - "PMULL2 v3.1q, v14.2d, v10.2d \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v14.16b, v14.16b, v14.16b, #8 \n" - "PMULL v3.1q, v14.1d, v10.1d \n" - "PMULL2 v14.1q, v14.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" - "PMULL v2.1q, v13.1d, v11.1d \n" - "PMULL2 v3.1q, v13.2d, v11.2d \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "PMULL v3.1q, v13.1d, v11.1d \n" - "PMULL2 v13.1q, v13.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" - "PMULL v2.1q, v12.1d, v4.1d \n" - "PMULL2 v3.1q, v12.2d, v4.2d \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" "EOR v17.16b, v17.16b, v2.16b \n" "EOR v0.16b, v0.16b, v3.16b \n" "EXT v12.16b, v12.16b, v12.16b, #8 \n" - "PMULL v3.1q, v12.1d, v4.1d \n" - "PMULL2 v12.1q, v12.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -6747,7 +16384,7 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CMP w11, #64 \n" "BLT 10f \n" - "# First encrypt - no GHASH \n" + "# First decrypt - no GHASH \n" "# Calculate next 4 counters (+1-4) \n" "ADD w15, w12, #1 \n" "MOV v27.16b, v22.16b \n" @@ -6840,6 +16477,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" @@ -6856,41 +16495,22 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v11.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v12.16b \n" - "AESMC v27.16b, v27.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" "AESE v28.16b, v12.16b \n" - "AESMC v28.16b, v28.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" "AESE v29.16b, v12.16b \n" - "AESMC v29.16b, v29.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" "AESE v30.16b, v12.16b \n" - "AESMC v30.16b, v30.16b \n" - "LD1 {v14.2d, v15.2d}, [%[Key]] \n" - "AESE v27.16b, v13.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v13.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v13.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v13.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v14.16b \n" - "EOR v27.16b, v27.16b, v15.16b \n" - "AESE v28.16b, v14.16b \n" - "EOR v28.16b, v28.16b, v15.16b \n" - "AESE v29.16b, v14.16b \n" - "EOR v29.16b, v29.16b, v15.16b \n" - "AESE v30.16b, v14.16b \n" - "EOR v30.16b, v30.16b, v15.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BLT 12f \n" @@ -6955,12 +16575,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -6979,12 +16595,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -7003,12 +16615,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -7019,16 +16627,10 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ -7056,57 +16658,40 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" "AESMC v28.16b, v28.16b \n" "AESE v29.16b, v10.16b \n" "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v10.16b \n" - "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v11.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v11.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v11.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v11.16b \n" - "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" - "AESE v27.16b, v12.16b \n" - "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v12.16b \n" - "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v12.16b \n" - "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v12.16b \n" + "AESE v30.16b, v10.16b \n" "AESMC v30.16b, v30.16b \n" - "LD1 {v14.2d, v15.2d}, [%[Key]] \n" - "AESE v27.16b, v13.16b \n" + "AESE v27.16b, v11.16b \n" "AESMC v27.16b, v27.16b \n" - "AESE v28.16b, v13.16b \n" + "AESE v28.16b, v11.16b \n" "AESMC v28.16b, v28.16b \n" - "AESE v29.16b, v13.16b \n" + "AESE v29.16b, v11.16b \n" "AESMC v29.16b, v29.16b \n" - "AESE v30.16b, v13.16b \n" + "AESE v30.16b, v11.16b \n" "AESMC v30.16b, v30.16b \n" - "AESE v27.16b, v14.16b \n" - "EOR v27.16b, v27.16b, v15.16b \n" - "AESE v28.16b, v14.16b \n" - "EOR v28.16b, v28.16b, v15.16b \n" - "AESE v29.16b, v14.16b \n" - "EOR v29.16b, v29.16b, v15.16b \n" - "AESE v30.16b, v14.16b \n" - "EOR v30.16b, v30.16b, v15.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, 
v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" "# XOR in input \n" - "EOR v18.16b, v18.16b, v27.16b \n" - "EOR v19.16b, v19.16b, v28.16b \n" - "EOR v20.16b, v20.16b, v29.16b \n" - "EOR v21.16b, v21.16b, v30.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" "# Store cipher text \n" - "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" "CMP w11, #64 \n" "BGE 11b \n" @@ -7132,12 +16717,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -7146,12 +16727,8 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -7160,33 +16737,22 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" "10: \n" - "SUB %[Key], %[Key], #32 \n" "CBZ w11, 30f \n" "CMP w11, #16 \n" "BLT 20f \n" - "# Encrypt first block for GHASH \n" + "# Decrypt first block for GHASH \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" @@ -7208,39 +16774,30 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v31.2d}, [%[input]], #16 \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]] \n" - "SUB %[Key], %[Key], #32 \n" "AESE v0.16b, v12.16b \n" "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, 
[%[out]], #16 \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" - "# When only one full block to encrypt go straight to GHASH \n" + "# When only one full block to decrypt go straight to GHASH \n" "CMP w11, 16 \n" "BLT 1f \n" - "LD1 {v31.2d}, [%[input]], #16 \n" - - "# Interweave GHASH and encrypt if more then 1 block \n" + "# Interweave GHASH and decrypt if more then 1 block \n" "2: \n" - "RBIT v15.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" @@ -7270,6 +16827,7 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "MOV v18.D[1], v21.D[0] \n" @@ -7278,27 +16836,17 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EOR v17.16b, v18.16b, v20.16b \n" "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]] \n" - "SUB %[Key], %[Key], #32 \n" "AESE v0.16b, v12.16b \n" "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[out]], #16 \n" - "CMP w11, 16 \n" - "BLT 1f \n" - - "LD1 {v31.2d}, [%[input]], #16 \n" - "B 2b \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" "# GHASH on last block \n" "1: \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -7326,89 +16874,75 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "CBNZ x15, 23b \n" "SUB %[scratch], %[scratch], x11 \n" "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" "REV w13, w12 \n" "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" "AESE v0.16b, v1.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" "AESE v0.16b, v2.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" "AESE v0.16b, v3.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" "AESE v0.16b, v4.16b \n" "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" "AESE v0.16b, v5.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v6.16b \n" "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" "AESE v0.16b, v7.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" "AESE v0.16b, v8.16b \n" "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v12.16b \n" - 
"AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]] \n" - "SUB %[Key], %[Key], #32 \n" "AESE v0.16b, v12.16b \n" "EOR v0.16b, v0.16b, v13.16b \n \n" - "EOR v15.16b, v0.16b, v31.16b \n \n" - "ST1 {v15.2d}, [%[scratch]] \n" + "EOR v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" "MOV x15, x11 \n" "24: \n" "LDRB w14, [%[scratch]], #1 \n" "STRB w14, [%[out]], #1 \n" "SUB x15, x15, #1 \n" "CBNZ x15, 24b \n" - "MOV x15, #16 \n" - "EOR w14, w14, w14 \n" - "SUB x15, x15, x11 \n" - "25: \n" - "STRB w14, [%[scratch]], #1 \n" - "SUB x15, x15, #1 \n" - "CBNZ x15, 25b \n" - "SUB %[scratch], %[scratch], #16 \n" - "LD1 {v15.2d}, [%[scratch]] \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" - "PMULL v18.1q, v17.1d, v16.1d \n" - "PMULL2 v19.1q, v17.2d, v16.2d \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" - "PMULL v21.1q, v17.1d, v20.1d \n" - "PMULL2 v20.1q, v17.2d, v20.2d \n" - "EOR v20.16b, v20.16b, v21.16b \n" - "EXT v21.16b, v18.16b, v19.16b, #8 \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "# Reduce \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EOR v21.16b, v21.16b, v20.16b \n" - "PMULL2 v20.1q, v21.2d, v23.2d \n" - "MOV v18.D[1], v21.D[0] \n" - "EOR v17.16b, v18.16b, v20.16b \n" + "SUB %[scratch], %[scratch], x11 \n" "30: \n" "# store current counter value at the end \n" "REV w13, w12 \n" "MOV v22.S[3], w13 \n" - "LD1 {v0.2d}, [%[ctr]] \n" - "ST1 {v22.2d}, [%[ctr]] \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" "LSL %x[aSz], %x[aSz], #3 \n" "LSL %x[sz], %x[sz], #3 \n" - "MOV v15.d[0], %x[aSz] \n" - "MOV v15.d[1], %x[sz] \n" - "REV64 v15.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" - "EOR v17.16b, v17.16b, v15.16b \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" "PMULL v18.1q, v17.1d, v16.1d \n" "PMULL2 v19.1q, v17.2d, v16.2d \n" "EXT v20.16b, v16.16b, v16.16b, #8 \n" @@ -7445,96 +16979,64 @@ static void Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d, v13.2d}, [%[Key]] \n" - "SUB %[Key], %[Key], #32 \n" "AESE v0.16b, v12.16b \n" "EOR v0.16b, v0.16b, v13.16b \n \n" "RBIT v17.16b, v17.16b \n" "EOR v0.16b, v0.16b, v17.16b \n \n" "CMP %w[tagSz], #16 \n" "BNE 40f \n" - "ST1 {v0.2d}, [%[tag]] \n" + "LD1 {v1.2d}, [%[tag]] \n" "B 41f \n" "40: \n" - "ST1 {v0.2d}, [%[scratch]] \n" + "EOR v1.16b, v1.16b, v1.16b \n" "MOV x15, %x[tagSz] \n" - "44: \n" - "LDRB w14, [%[scratch]], #1 \n" - "STRB w14, [%[tag]], #1 \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" "SUB x15, x15, #1 \n" - "CBNZ x15, 44b \n" + "CBNZ x15, 43b \n" "SUB %[scratch], %[scratch], %x[tagSz] \n" - "41: \n" - - : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), - [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) - : [ctr] "r" (ctr), [scratch] "r" (scratch), - [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) - : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", 
"v30", "v31" - ); -} -#endif /* WOLFSSL_AES_256 */ - -/* aarch64 with PMULL and PMULL2 - * Encrypt and tag data using AES with GCM mode. - * aes: Aes structure having already been set with set key function - * out: encrypted data output buffer - * in: plain text input buffer - * sz: size of plain text and out buffer - * iv: initialization vector - * ivSz: size of iv buffer - * authTag: buffer to hold tag - * authTagSz: size of tag buffer - * authIn: additional data buffer - * authInSz: size of additional data buffer - * - * Notes: - * GHASH multiplication based from Algorithm 1 from Intel GCM white paper. - * "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" - * - * GHASH reduction Based from White Paper "Implementing GCM on ARMv8" - * by Conrado P.L. Gouvea and Julio Lopez reduction on 256bit value using - * Algorithm 5 - */ -void AES_GCM_encrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) -{ - switch (aes->rounds) { -#ifdef WOLFSSL_AES_128 - case 10: - Aes128GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz); - break; -#endif -#ifdef WOLFSSL_AES_192 - case 12: - Aes192GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz); - break; -#endif -#ifdef WOLFSSL_AES_256 - case 14: - Aes256GcmEncrypt(aes, out, in, sz, iv, ivSz, authTag, authTagSz, - authIn, authInSz); - break; -#endif - } -} + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" + "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" + "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" -#ifdef HAVE_AES_DECRYPT -#ifdef WOLFSSL_AES_128 -/* internal function : see wc_AesGcmDecrypt */ -static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn), + [ret] "+r" (ret) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "x11", "x12", "w13", "x14", "x15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + return ret; +} +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 +#ifdef WOLFSSL_AES_192 +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes192GcmDecrypt_EOR3(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -7625,12 +17127,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 
\n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -7639,12 +17136,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -7653,23 +17145,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -8004,8 +17484,42 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #144] \n" + "LDP q22, q31, [%[Key], #176] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -8122,12 +17636,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -8146,12 +17655,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR 
v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -8171,12 +17675,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" @@ -8196,12 +17695,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" @@ -8220,12 +17714,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" @@ -8245,12 +17734,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" @@ -8270,12 +17754,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" "# Reduce X = x[0-2] \n" @@ -8285,16 +17764,9 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" "PMULL2 v2.1q, v3.2d, v23.2d \n" @@ -8356,8 +17828,42 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, 
v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #144] \n" + "LDP q22, q31, [%[Key], #176] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -8432,12 +17938,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -8446,12 +17947,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -8460,12 +17956,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" "PMULL v2.1q, v15.1d, v4.1d \n" "PMULL2 v3.1q, v15.2d, v4.2d \n" @@ -8474,12 +17965,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v15.16b, v15.16b, v15.16b, #8 \n" "PMULL v3.1q, v15.1d, v4.1d \n" "PMULL2 v15.1q, v15.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" "PMULL v2.1q, v14.1d, v9.1d \n" "PMULL2 v3.1q, v14.2d, v9.2d \n" @@ -8488,12 +17974,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "PMULL v3.1q, v14.1d, v9.1d \n" "PMULL2 v14.1q, v14.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" "PMULL v2.1q, v13.1d, v10.1d \n" "PMULL2 v3.1q, v13.2d, v10.2d \n" @@ -8502,12 +17983,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "PMULL v3.1q, v13.1d, v10.1d \n" 
"PMULL2 v13.1q, v13.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" "PMULL v2.1q, v12.1d, v11.1d \n" "PMULL2 v3.1q, v12.2d, v11.2d \n" @@ -8516,23 +17992,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "PMULL v3.1q, v12.1d, v11.1d \n" "PMULL2 v12.1q, v12.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -8542,6 +18006,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" "# Can we do 4 blocks at a time? \n" "CMP w11, #64 \n" "BLT 10f \n" @@ -8623,8 +18088,6 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v7.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v8.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v8.16b \n" @@ -8641,14 +18104,32 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" - "EOR v27.16b, v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" - "EOR v28.16b, v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" "AESE v29.16b, v10.16b \n" - "EOR v29.16b, v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v10.16b \n" - "EOR v30.16b, v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" "# XOR in input \n" "EOR v27.16b, v27.16b, v18.16b \n" @@ -8721,12 +18202,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -8745,12 +18221,7 @@ static int 
Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -8769,12 +18240,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -8785,16 +18251,9 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ -8806,8 +18265,6 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v7.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v8.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v8.16b \n" @@ -8824,14 +18281,32 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" - "EOR v27.16b, v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" - "EOR v28.16b, v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" "AESE v29.16b, v10.16b \n" - "EOR v29.16b, v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v10.16b \n" - "EOR v30.16b, v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" "# XOR in input \n" "EOR v27.16b, v27.16b, v18.16b \n" @@ -8865,12 +18340,7 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -8879,12 +18349,7 @@ 
static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -8893,23 +18358,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -8944,7 +18397,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v9.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" "EOR v0.16b, v0.16b, v28.16b \n \n" "ST1 {v0.2d}, [%[out]], #16 \n" @@ -8994,8 +18451,12 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n \n" + "AESMC v0.16b, v0.16b \n" "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" "EOR v0.16b, v0.16b, v28.16b \n \n" "ST1 {v0.2d}, [%[out]], #16 \n" "CMP w11, #16 \n" @@ -9071,8 +18532,12 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "MOV v18.D[1], v21.D[0] \n" "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n \n" + "AESMC v0.16b, v0.16b \n" "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" "EOR v0.16b, v0.16b, v31.16b \n \n" "ST1 {v0.2d}, [%[scratch]] \n" "MOV x15, x11 \n" @@ -9130,7 +18595,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v0.16b, v0.16b \n" "EOR v17.16b, v18.16b, v20.16b \n" "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" "RBIT v17.16b, v17.16b \n" "EOR v0.16b, v0.16b, v17.16b \n \n" "CMP %w[tagSz], #16 \n" @@ -9182,10 +18651,11 @@ static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return ret; } -#endif /* WOLFSSL_AES_128 */ -#ifdef WOLFSSL_AES_192 -/* internal function : see wc_AesGcmDecrypt */ -static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, +#endif /* 
WOLFSSL_AES_192 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifdef WOLFSSL_AES_256 +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -9276,12 +18746,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -9290,12 +18756,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -9304,23 +18766,13 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -9689,8 +19141,42 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #176] \n" + "LDP q22, q31, [%[Key], #208] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b 
\n" @@ -9807,12 +19293,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -9831,12 +19313,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -9856,12 +19334,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" @@ -9881,12 +19355,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" @@ -9905,12 +19375,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" @@ -9930,12 +19396,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" @@ -9955,12 +19417,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" "# Reduce X = x[0-2] \n" @@ -9970,16 +19428,10 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" "PMULL2 v2.1q, v3.2d, v23.2d \n" @@ -10075,8 +19527,42 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v22.16b \n" "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" - "LDP q22, q31, [%[Key], #176] \n" + "LDP q22, q31, [%[Key], #208] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" "AESE v6.16b, v1.16b \n" @@ -10151,12 +19637,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else "EOR v20.16b, v20.16b, v3.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -10165,12 +19647,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else "EOR v19.16b, v19.16b, v3.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -10179,12 +19657,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else "EOR v18.16b, v18.16b, v3.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" "PMULL v2.1q, v15.1d, v4.1d \n" "PMULL2 v3.1q, v15.2d, v4.2d \n" @@ -10193,12 +19667,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v15.16b, v15.16b, v15.16b, #8 \n" "PMULL v3.1q, v15.1d, v4.1d \n" "PMULL2 v15.1q, v15.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else "EOR v15.16b, v15.16b, v3.16b \n" "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" "PMULL 
v2.1q, v14.1d, v9.1d \n" "PMULL2 v3.1q, v14.2d, v9.2d \n" @@ -10207,12 +19677,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "PMULL v3.1q, v14.1d, v9.1d \n" "PMULL2 v14.1q, v14.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else "EOR v14.16b, v14.16b, v3.16b \n" "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" "PMULL v2.1q, v13.1d, v10.1d \n" "PMULL2 v3.1q, v13.2d, v10.2d \n" @@ -10221,12 +19687,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "PMULL v3.1q, v13.1d, v10.1d \n" "PMULL2 v13.1q, v13.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else "EOR v13.16b, v13.16b, v3.16b \n" "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" "PMULL v2.1q, v12.1d, v11.1d \n" "PMULL2 v3.1q, v12.2d, v11.2d \n" @@ -10235,23 +19697,13 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "PMULL v3.1q, v12.1d, v11.1d \n" "PMULL2 v12.1q, v12.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else "EOR v12.16b, v12.16b, v3.16b \n" "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -10262,6 +19714,7 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "LD1 {v14.2d-v15.2d}, [%[Key]] \n" "# Can we do 4 blocks at a time? 
\n" "CMP w11, #64 \n" "BLT 10f \n" @@ -10359,8 +19812,6 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" @@ -10377,14 +19828,33 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v11.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v12.16b \n" - "EOR v27.16b, v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v12.16b \n" - "EOR v28.16b, v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" "AESE v29.16b, v12.16b \n" - "EOR v29.16b, v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v12.16b \n" - "EOR v30.16b, v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" "# XOR in input \n" "EOR v27.16b, v27.16b, v18.16b \n" @@ -10457,12 +19927,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -10481,12 +19947,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -10505,12 +19967,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -10521,16 +19979,10 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC 
v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ -10558,8 +20010,6 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v9.16b \n" "AESMC v30.16b, v30.16b \n" - "# Load plaintext \n" - "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v10.16b \n" "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v10.16b \n" @@ -10576,14 +20026,33 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v11.16b \n" "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" "AESE v27.16b, v12.16b \n" - "EOR v27.16b, v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" "AESE v28.16b, v12.16b \n" - "EOR v28.16b, v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" "AESE v29.16b, v12.16b \n" - "EOR v29.16b, v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" "AESE v30.16b, v12.16b \n" - "EOR v30.16b, v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" "# XOR in input \n" "EOR v27.16b, v27.16b, v18.16b \n" @@ -10617,12 +20086,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else "EOR v20.16b, v20.16b, v15.16b \n" "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -10631,12 +20096,8 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else "EOR v19.16b, v19.16b, v15.16b \n" "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -10645,31 +20106,23 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else "EOR v18.16b, v18.16b, v15.16b \n" "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" "10: \n" "CBZ w11, 
30f \n" "CMP w11, #16 \n" "BLT 20f \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" "# Decrypt first block for GHASH \n" "ADD w12, w12, #1 \n" "MOV v0.16b, v22.16b \n" @@ -10700,7 +20153,11 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" "EOR v0.16b, v0.16b, v28.16b \n \n" "ST1 {v0.2d}, [%[out]], #16 \n" @@ -10755,7 +20212,11 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" "EOR v0.16b, v0.16b, v28.16b \n \n" "ST1 {v0.2d}, [%[out]], #16 \n" "CMP w11, #16 \n" @@ -10836,7 +20297,11 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" "EOR v0.16b, v0.16b, v31.16b \n \n" "ST1 {v0.2d}, [%[scratch]] \n" "MOV x15, x11 \n" @@ -10898,7 +20363,11 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "AESE v0.16b, v11.16b \n" "AESMC v0.16b, v0.16b \n" "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" "RBIT v17.16b, v17.16b \n" "EOR v0.16b, v0.16b, v17.16b \n \n" "CMP %w[tagSz], #16 \n" @@ -10950,10 +20419,11 @@ static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return ret; } -#endif /* WOLFSSL_AES_192 */ +#endif /* WOLFSSL_AES_256 */ +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 #ifdef WOLFSSL_AES_256 -/* internal function : see wc_AesGcmDecrypt */ -static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, +/* internal function : see AES_GCM_decrypt_AARCH64 */ +static int Aes256GcmDecrypt_EOR3(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { @@ -11044,12 +20514,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -11058,12 +20523,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, 
v18.2d, v26.2d \n" @@ -11072,23 +20532,11 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v30.16b, #8 \n" "PMULL2 v14.1q, v30.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -11609,12 +21057,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" "# x[0-2] += C * H^3 \n" @@ -11633,12 +21076,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "LDR q1, [%[Key], #32] \n" "AESE v5.16b, v22.16b \n" "AESMC v5.16b, v5.16b \n" @@ -11658,12 +21096,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v22.16b \n" "AESMC v28.16b, v28.16b \n" "# x[0-2] += C * H^5 \n" @@ -11683,12 +21116,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v15.1q, v15.2d, v4.2d \n" "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v7.16b, v1.16b \n" "AESMC v7.16b, v7.16b \n" "# x[0-2] += C * H^6 \n" @@ -11707,12 +21135,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v14.2d, v9.2d \n" "AESE v29.16b, v1.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v1.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^7 \n" @@ -11732,12 +21155,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v13.1q, v13.2d, v10.2d \n" "AESE v8.16b, v22.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - 
"EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v22.16b \n" "AESMC v27.16b, v27.16b \n" "# x[0-2] += C * H^8 \n" @@ -11757,12 +21175,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "LDR q22, [%[Key], #80] \n" "AESE v5.16b, v1.16b \n" "AESMC v5.16b, v5.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v6.16b, v1.16b \n" "AESMC v6.16b, v6.16b \n" "# Reduce X = x[0-2] \n" @@ -11772,16 +21185,9 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v2.1q, v0.2d, v23.2d \n" "AESE v8.16b, v1.16b \n" "AESMC v8.16b, v8.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v1.16b \n" "AESMC v27.16b, v27.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v28.16b, v1.16b \n" "AESMC v28.16b, v28.16b \n" "PMULL2 v2.1q, v3.2d, v23.2d \n" @@ -11987,12 +21393,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v3.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" -#else - "EOR v20.16b, v20.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v2.1q, v19.1d, v25.1d \n" "PMULL2 v3.1q, v19.2d, v25.2d \n" @@ -12001,12 +21402,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v3.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" -#else - "EOR v19.16b, v19.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v2.1q, v18.1d, v26.1d \n" "PMULL2 v3.1q, v18.2d, v26.2d \n" @@ -12015,12 +21411,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v3.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" -#else - "EOR v18.16b, v18.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^5 \n" "PMULL v2.1q, v15.1d, v4.1d \n" "PMULL2 v3.1q, v15.2d, v4.2d \n" @@ -12029,12 +21420,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v15.16b, v15.16b, v15.16b, #8 \n" "PMULL v3.1q, v15.1d, v4.1d \n" "PMULL2 v15.1q, v15.2d, v4.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" -#else - "EOR v15.16b, v15.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v15.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^6 \n" "PMULL v2.1q, v14.1d, v9.1d \n" "PMULL2 v3.1q, v14.2d, v9.2d \n" @@ -12043,12 +21429,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v14.16b, v14.16b, v14.16b, #8 \n" "PMULL v3.1q, v14.1d, v9.1d \n" "PMULL2 v14.1q, v14.2d, v9.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" -#else - "EOR v14.16b, v14.16b, v3.16b \n" - "EOR v31.16b, 
v31.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^7 \n" "PMULL v2.1q, v13.1d, v10.1d \n" "PMULL2 v3.1q, v13.2d, v10.2d \n" @@ -12057,12 +21438,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v13.16b, v13.16b, v13.16b, #8 \n" "PMULL v3.1q, v13.1d, v10.1d \n" "PMULL2 v13.1q, v13.2d, v10.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" -#else - "EOR v13.16b, v13.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v13.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^8 \n" "PMULL v2.1q, v12.1d, v11.1d \n" "PMULL2 v3.1q, v12.2d, v11.2d \n" @@ -12071,23 +21447,11 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v12.16b, v12.16b, v12.16b, #8 \n" "PMULL v3.1q, v12.1d, v11.1d \n" "PMULL2 v12.1q, v12.2d, v11.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" -#else - "EOR v12.16b, v12.16b, v3.16b \n" - "EOR v31.16b, v31.16b, v12.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v3.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v2.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" -#else - "EOR v3.16b, v3.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v3.16b, v3.16b, v2.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v2.1q, v3.2d, v23.2d \n" "MOV v17.D[1], v3.D[0] \n" "EOR v17.16b, v17.16b, v2.16b \n" @@ -12311,12 +21675,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v20.1q, v20.2d, v24.2d \n" "AESE v28.16b, v3.16b \n" "AESMC v28.16b, v28.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v29.16b, v3.16b \n" "AESMC v29.16b, v29.16b \n" "# x[0-2] += C * H^3 \n" @@ -12335,12 +21694,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v19.1q, v19.2d, v25.2d \n" "AESE v29.16b, v4.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v4.16b \n" "AESMC v30.16b, v30.16b \n" "# x[0-2] += C * H^4 \n" @@ -12359,12 +21713,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v18.1q, v18.2d, v26.2d \n" "AESE v30.16b, v5.16b \n" "AESMC v30.16b, v30.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "SUB w11, w11, #64 \n" "AESE v27.16b, v6.16b \n" "AESMC v27.16b, v27.16b \n" @@ -12375,16 +21724,9 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "PMULL2 v14.1q, v0.2d, v23.2d \n" "AESE v29.16b, v6.16b \n" "AESMC v29.16b, v29.16b \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v30.16b, v6.16b \n" "AESMC v30.16b, v30.16b \n" -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "AESE v27.16b, v7.16b \n" "AESMC v27.16b, v27.16b \n" "PMULL2 v14.1q, v15.2d, v23.2d \n" @@ 
-12488,12 +21830,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v20.16b, v20.16b, v20.16b, #8 \n" "PMULL v15.1q, v20.1d, v24.1d \n" "PMULL2 v20.1q, v20.2d, v24.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" -#else - "EOR v20.16b, v20.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v20.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^3 \n" "PMULL v14.1q, v19.1d, v25.1d \n" "PMULL2 v15.1q, v19.2d, v25.2d \n" @@ -12502,12 +21839,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v19.16b, v19.16b, v19.16b, #8 \n" "PMULL v15.1q, v19.1d, v25.1d \n" "PMULL2 v19.1q, v19.2d, v25.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" -#else - "EOR v19.16b, v19.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v19.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# x[0-2] += C * H^4 \n" "PMULL v14.1q, v18.1d, v26.1d \n" "PMULL2 v15.1q, v18.2d, v26.2d \n" @@ -12516,23 +21848,11 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, "EXT v18.16b, v18.16b, v18.16b, #8 \n" "PMULL v15.1q, v18.1d, v26.1d \n" "PMULL2 v18.1q, v18.2d, v26.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" -#else - "EOR v18.16b, v18.16b, v15.16b \n" - "EOR v31.16b, v31.16b, v18.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "# Reduce X = x[0-2] \n" "EXT v15.16b, v17.16b, v0.16b, #8 \n" "PMULL2 v14.1q, v0.2d, v23.2d \n" -#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" -#else - "EOR v15.16b, v15.16b, v31.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ -#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 - "EOR v15.16b, v15.16b, v14.16b \n" -#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ "PMULL2 v14.1q, v15.2d, v23.2d \n" "MOV v17.D[1], v15.D[0] \n" "EOR v17.16b, v17.16b, v14.16b \n" @@ -12840,6 +22160,7 @@ static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, return ret; } #endif /* WOLFSSL_AES_256 */ +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ /* * Check tag and decrypt data using AES with GCM mode. 
* aes: Aes structure having already been set with set key function @@ -12857,22 +22178,48 @@ int AES_GCM_decrypt_AARCH64(Aes* aes, byte* out, const byte* in, word32 sz, const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, const byte* authIn, word32 authInSz) { - /* sanity checks */ switch (aes->rounds) { #ifdef WOLFSSL_AES_128 case 10: - return Aes128GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz); + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + return Aes128GcmDecrypt_EOR3(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + } + else + #endif + { + return Aes128GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } #endif #ifdef WOLFSSL_AES_192 case 12: - return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz); + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + return Aes192GcmDecrypt_EOR3(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + } + else + #endif + { + return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } #endif #ifdef WOLFSSL_AES_256 case 14: - return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, - authTagSz, authIn, authInSz); + #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + if (aes->use_sha3_hw_crypto) { + return Aes256GcmDecrypt_EOR3(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); + } + else + #endif + { + return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, authTag, + authTagSz, authIn, authInSz); + } #endif } diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm.S b/wolfcrypt/src/port/arm/armv8-kyber-asm.S index 063f155eee..2a9b4df85c 100644 --- a/wolfcrypt/src/port/arm/armv8-kyber-asm.S +++ b/wolfcrypt/src/port/arm/armv8-kyber-asm.S @@ -263,60 +263,40 @@ _kyber_ntt: mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* 
!WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v13.8h, v5.8h, v21.8h @@ -339,60 +319,40 @@ _kyber_ntt: mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h @@ -415,60 +375,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[4] sqrdmulh v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] 
sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h @@ -493,60 +433,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -603,60 +523,40 @@ _kyber_ntt: mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else 
sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v13.8h, v5.8h, v21.8h @@ -679,60 +579,40 @@ _kyber_ntt: mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h @@ -755,60 +635,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[4] sqrdmulh v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh 
v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h @@ -833,60 +693,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -935,60 +775,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, 
v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1021,15 +841,10 @@ _kyber_ntt: mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #96] @@ -1046,15 +861,10 @@ _kyber_ntt: mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #128] @@ -1071,15 +881,10 @@ _kyber_ntt: mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #160] @@ -1096,15 +901,10 @@ _kyber_ntt: mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1137,15 +937,10 @@ _kyber_ntt: mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #352] @@ -1162,15 +957,10 @@ _kyber_ntt: mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] @@ -1187,15 +977,10 @@ _kyber_ntt: mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ 
sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #416] @@ -1212,15 +997,10 @@ _kyber_ntt: mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1357,60 +1137,40 @@ _kyber_ntt: mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1443,15 +1203,10 @@ _kyber_ntt: mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #224] @@ -1468,15 +1223,10 @@ _kyber_ntt: mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] @@ -1493,15 +1243,10 @@ _kyber_ntt: mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] 
sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #288] @@ -1518,15 +1263,10 @@ _kyber_ntt: mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1559,15 +1299,10 @@ _kyber_ntt: mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v29.8h, v4.h[0] - sqrdmlsh v22.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #480] @@ -1584,15 +1319,10 @@ _kyber_ntt: mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v29.8h, v4.h[0] - sqrdmlsh v24.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] @@ -1609,15 +1339,10 @@ _kyber_ntt: mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v25.8h, v29.8h, v4.h[0] - sqrdmlsh v26.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #544] @@ -1634,15 +1359,10 @@ _kyber_ntt: mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v27.8h, v29.8h, v4.h[0] - sqrdmlsh v28.8h, v30.8h, v4.h[0] -#else sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h @@ -1983,15 +1703,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #32] @@ -2006,15 +1721,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #64] @@ -2029,15 +1739,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, 
v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #96] @@ -2052,15 +1757,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] @@ -2081,15 +1781,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #288] @@ -2110,15 +1805,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #320] @@ -2139,15 +1829,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #352] @@ -2168,15 +1853,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] @@ -2195,15 +1875,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b @@ -2220,15 +1895,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub 
v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b @@ -2245,15 +1915,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b @@ -2270,15 +1935,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] @@ -2381,15 +2041,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #160] @@ -2404,15 +2059,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #192] @@ -2427,15 +2077,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #224] @@ -2450,15 +2095,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] @@ -2479,15 +2119,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #416] @@ -2508,15 +2143,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh 
v16.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #448] @@ -2537,15 +2167,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #480] @@ -2566,15 +2191,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #528] @@ -2593,15 +2213,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b @@ -2618,15 +2233,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b @@ -2643,15 +2253,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b @@ -2668,15 +2273,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, v0.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] @@ -2739,15 +2339,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, 
v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h @@ -2758,15 +2353,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h @@ -2777,15 +2367,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h @@ -2796,15 +2381,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h @@ -2815,15 +2395,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h @@ -2834,15 +2409,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h @@ -2853,15 +2423,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v19.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v23.8h @@ -2872,15 +2437,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h @@ -2891,15 +2451,10 @@ 
_kyber_invntt: mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v13.8h, v25.8h, v8.h[0] - sqrdmlsh v14.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h @@ -2910,15 +2465,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h @@ -2929,15 +2479,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v25.8h, v8.h[0] - sqrdmlsh v22.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h @@ -2948,15 +2493,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] @@ -2991,15 +2531,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v17.8h, v25.8h, v8.h[0] - sqrdmlsh v18.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h @@ -3010,15 +2545,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v19.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h @@ -3029,15 +2559,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v25.8h, v8.h[0] - sqrdmlsh v22.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h @@ -3048,135 +2573,90 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, 
v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, v9.8h, v7.h[7] mul v26.8h, v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v9.8h, v25.8h, v8.h[0] - sqrdmlsh v10.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v9.8h, v9.8h, v25.8h sub v10.8h, v10.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v13.8h, v25.8h, v8.h[0] - sqrdmlsh v14.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v17.8h, v25.8h, v8.h[0] - sqrdmlsh v18.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v19.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v25.8h, v8.h[0] - sqrdmlsh v22.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v25.8h, v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, 
v25.8h, v8.h[0] - sqrdmlsh v24.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0] @@ -3219,15 +2699,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v10.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h @@ -3238,15 +2713,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v14.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h @@ -3257,15 +2727,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v18.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h @@ -3276,15 +2741,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v22.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h @@ -3295,15 +2755,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h @@ -3314,15 +2769,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h @@ -3333,15 +2783,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v19.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* 
!WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v23.8h @@ -3352,15 +2797,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h @@ -3371,15 +2811,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v13.8h, v25.8h, v8.h[0] - sqrdmlsh v14.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h @@ -3390,15 +2825,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v25.8h, v8.h[0] - sqrdmlsh v16.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h @@ -3409,15 +2839,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v25.8h, v8.h[0] - sqrdmlsh v22.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h @@ -3428,15 +2853,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] @@ -3471,15 +2891,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v17.8h, v25.8h, v8.h[0] - sqrdmlsh v18.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h @@ -3490,15 +2905,10 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v19.8h, v25.8h, v8.h[0] - sqrdmlsh v20.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h @@ -3509,15 +2919,10 @@ _kyber_invntt: mul v27.8h, v28.8h, 
v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v21.8h, v25.8h, v8.h[0] - sqrdmlsh v22.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h @@ -3528,135 +2933,2579 @@ _kyber_invntt: mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v23.8h, v25.8h, v8.h[0] - sqrdmlsh v24.8h, v27.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, v9.8h, v7.h[7] mul v26.8h, v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v9.8h, v25.8h, v8.h[0] - sqrdmlsh v10.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v9.8h, v9.8h, v25.8h sub v10.8h, v10.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v25.8h, v8.h[0] - sqrdmlsh v12.8h, v26.8h, v8.h[0] -#else sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v13.8h, v13.8h, v25.8h + sub v14.8h, v14.8h, v26.8h + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v15.8h, v15.8h, v25.8h + sub v16.8h, v16.8h, v26.8h + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v17.8h, v17.8h, v25.8h + sub v18.8h, v18.8h, v26.8h + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v19.8h, v19.8h, v25.8h + sub v20.8h, v20.8h, v26.8h + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub v21.8h, v21.8h, v25.8h + sub v22.8h, v22.8h, v26.8h + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] + sqrdmulh v25.8h, v25.8h, v8.h[0] + sqrdmulh v26.8h, v26.8h, v8.h[0] + sub 
v23.8h, v23.8h, v25.8h + sub v24.8h, v24.8h, v26.8h + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0, #16] + str q10, [x0, #48] + str q11, [x0, #80] + str q12, [x0, #112] + str q13, [x0, #144] + str q14, [x0, #176] + str q15, [x0, #208] + str q16, [x0, #240] + str q17, [x1, #16] + str q18, [x1, #48] + str q19, [x1, #80] + str q20, [x1, #112] + str q21, [x1, #144] + str q22, [x1, #176] + str q23, [x1, #208] + str q24, [x1, #240] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_invntt,.-kyber_invntt +#endif /* __APPLE__ */ +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH +#ifndef __APPLE__ +.text +.globl kyber_ntt_sqrdmlsh +.type kyber_ntt_sqrdmlsh,@function +.align 2 +kyber_ntt_sqrdmlsh: +#else +.section __TEXT,__text +.globl _kyber_ntt_sqrdmlsh +.p2align 2 +_kyber_ntt_sqrdmlsh: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! + add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas + add x2, x2, :lo12:L_kyber_aarch64_zetas +#else + adrp x2, L_kyber_aarch64_zetas@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv +#else + adrp x3, L_kyber_aarch64_zetas_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q4, [x4] + ldr q5, [x0] + ldr q6, [x0, #32] + ldr q7, [x0, #64] + ldr q8, [x0, #96] + ldr q9, [x0, #128] + ldr q10, [x0, #160] + ldr q11, [x0, #192] + ldr q12, [x0, #224] + ldr q13, [x1] + ldr q14, [x1, #32] + ldr q15, [x1, #64] + ldr q16, [x1, #96] + ldr q17, [x1, #128] + ldr q18, [x1, #160] + ldr q19, [x1, #192] + ldr q20, [x1, #224] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + 
sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, 
v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0] + str q6, [x0, #32] + str q7, [x0, #64] + str q8, [x0, #96] + str q9, [x0, #128] + str q10, [x0, #160] + str q11, [x0, #192] + str q12, [x0, #224] + str q13, [x1] + str q14, [x1, #32] + str q15, [x1, #64] + str q16, [x1, #96] + str q17, [x1, #128] + str q18, [x1, #160] + str q19, [x1, #192] + str q20, [x1, #224] + ldr q5, [x0, #16] + ldr q6, [x0, #48] + ldr q7, [x0, #80] + ldr q8, [x0, #112] + ldr q9, [x0, #144] + ldr q10, [x0, #176] + ldr q11, [x0, #208] + ldr q12, [x0, #240] + ldr q13, [x1, #16] + ldr q14, [x1, #48] + ldr q15, [x1, #80] + ldr q16, [x1, #112] + ldr q17, [x1, #144] + ldr q18, [x1, #176] + ldr q19, [x1, #208] + ldr q20, [x1, #240] + ldr q0, [x2] + ldr q1, [x3] + mul v29.8h, v13.8h, v1.h[1] + mul v30.8h, v14.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v22.8h, v14.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v15.8h, v1.h[1] + mul v30.8h, v16.8h, v1.h[1] + sqrdmulh v23.8h, v15.8h, v0.h[1] + sqrdmulh v24.8h, v16.8h, v0.h[1] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[1] + mul v30.8h, v18.8h, v1.h[1] + sqrdmulh v25.8h, v17.8h, v0.h[1] + sqrdmulh v26.8h, v18.8h, v0.h[1] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[1] + mul v30.8h, v20.8h, v1.h[1] + sqrdmulh v27.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v0.h[1] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v13.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v14.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v15.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v16.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v9.8h, v25.8h + add v9.8h, v9.8h, v25.8h + sub v18.8h, v10.8h, v26.8h + add v10.8h, v10.8h, v26.8h + sub v19.8h, v11.8h, v27.8h + add v11.8h, v11.8h, v27.8h + sub v20.8h, v12.8h, v28.8h + add v12.8h, v12.8h, v28.8h + mul v29.8h, v9.8h, v1.h[2] + mul v30.8h, v10.8h, v1.h[2] + sqrdmulh v21.8h, v9.8h, v0.h[2] + sqrdmulh v22.8h, v10.8h, v0.h[2] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[2] + sqrdmulh v23.8h, v11.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[2] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr 
v24.8h, v24.8h, #1 + mul v29.8h, v17.8h, v1.h[3] + mul v30.8h, v18.8h, v1.h[3] + sqrdmulh v25.8h, v17.8h, v0.h[3] + sqrdmulh v26.8h, v18.8h, v0.h[3] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[3] + mul v30.8h, v20.8h, v1.h[3] + sqrdmulh v27.8h, v19.8h, v0.h[3] + sqrdmulh v28.8h, v20.8h, v0.h[3] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v9.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v10.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v7.8h, v23.8h + add v7.8h, v7.8h, v23.8h + sub v12.8h, v8.8h, v24.8h + add v8.8h, v8.8h, v24.8h + sub v17.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v18.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v15.8h, v27.8h + add v15.8h, v15.8h, v27.8h + sub v20.8h, v16.8h, v28.8h + add v16.8h, v16.8h, v28.8h + mul v29.8h, v7.8h, v1.h[4] + mul v30.8h, v8.8h, v1.h[4] + sqrdmulh v21.8h, v7.8h, v0.h[4] + sqrdmulh v22.8h, v8.8h, v0.h[4] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v11.8h, v1.h[5] + mul v30.8h, v12.8h, v1.h[5] + sqrdmulh v23.8h, v11.8h, v0.h[5] + sqrdmulh v24.8h, v12.8h, v0.h[5] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v15.8h, v1.h[6] + mul v30.8h, v16.8h, v1.h[6] + sqrdmulh v25.8h, v15.8h, v0.h[6] + sqrdmulh v26.8h, v16.8h, v0.h[6] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v19.8h, v1.h[7] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v19.8h, v0.h[7] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v7.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v6.8h, v22.8h + add v6.8h, v6.8h, v22.8h + sub v11.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v10.8h, v24.8h + add v10.8h, v10.8h, v24.8h + sub v15.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v14.8h, v26.8h + add v14.8h, v14.8h, v26.8h + sub v19.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v18.8h, v28.8h + add v18.8h, v18.8h, v28.8h + ldr q0, [x2, #16] + ldr q1, [x3, #16] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, 
v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + str q5, [x0, #16] + str q6, [x0, #48] + str q7, [x0, #80] + str q8, [x0, #112] + str q9, [x0, #144] + str q10, [x0, #176] + str q11, [x0, #208] + str q12, [x0, #240] + str q13, [x1, #16] + str q14, [x1, #48] + str q15, [x1, #80] + str q16, [x1, #112] + str q17, [x1, #144] + str q18, [x1, #176] + str q19, [x1, #208] + str q20, [x1, #240] + ldp q5, q6, [x0] + ldp q7, q8, [x0, #32] + ldp q9, q10, [x0, #64] + ldp q11, q12, [x0, #96] + ldp q13, q14, [x0, #128] + ldp q15, q16, [x0, #160] + ldp q17, q18, [x0, #192] + ldp q19, q20, [x0, #224] + ldr q0, [x2, #32] + ldr q1, [x3, #32] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #64] + ldr q2, [x2, #80] + ldr q1, [x3, #64] + ldr q3, [x3, #80] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #96] + ldr q2, [x2, #112] + ldr q1, [x3, #96] + ldr q3, [x3, #112] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #128] + ldr q2, [x2, #144] + ldr q1, [x3, #128] + ldr q3, [x3, #144] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 
v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #160] + ldr q2, [x2, #176] + ldr q1, [x3, #160] + ldr q3, [x3, #176] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #320] + ldr q2, [x2, #336] + ldr q1, [x3, #320] + ldr q3, [x3, #336] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #352] + ldr q2, [x2, #368] + ldr q1, [x3, #352] + ldr q3, [x3, #368] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #384] + ldr q2, [x2, #400] + ldr q1, [x3, #384] + ldr q3, [x3, #400] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #416] + ldr q2, [x2, #432] + ldr q1, [x3, #416] + ldr q3, [x3, #432] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, 
v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x0] + stp q7, q8, [x0, #32] + stp q9, q10, [x0, #64] + stp q11, q12, [x0, #96] + stp q13, q14, [x0, #128] + stp q15, q16, [x0, #160] + stp q17, q18, [x0, #192] + stp q19, q20, [x0, #224] + ldp q5, q6, [x1] + ldp q7, q8, [x1, #32] + ldp q9, q10, [x1, #64] + ldp q11, q12, [x1, #96] + ldp q13, q14, [x1, #128] + ldp q15, q16, [x1, #160] + ldp q17, q18, [x1, #192] + ldp q19, q20, [x1, #224] + ldr q0, [x2, #48] + ldr q1, [x3, #48] + mul v29.8h, v6.8h, v1.h[0] + mul v30.8h, v8.8h, v1.h[1] + sqrdmulh v21.8h, v6.8h, v0.h[0] + sqrdmulh v22.8h, v8.8h, v0.h[1] + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr 
v22.8h, v22.8h, #1 + mul v29.8h, v10.8h, v1.h[2] + mul v30.8h, v12.8h, v1.h[3] + sqrdmulh v23.8h, v10.8h, v0.h[2] + sqrdmulh v24.8h, v12.8h, v0.h[3] + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v29.8h, v14.8h, v1.h[4] + mul v30.8h, v16.8h, v1.h[5] + sqrdmulh v25.8h, v14.8h, v0.h[4] + sqrdmulh v26.8h, v16.8h, v0.h[5] + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + mul v29.8h, v18.8h, v1.h[6] + mul v30.8h, v20.8h, v1.h[7] + sqrdmulh v27.8h, v18.8h, v0.h[6] + sqrdmulh v28.8h, v20.8h, v0.h[7] + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #192] + ldr q2, [x2, #208] + ldr q1, [x3, #192] + ldr q3, [x3, #208] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.2d, v5.2d, v6.2d + trn1 v7.2d, v7.2d, v8.2d + trn2 v6.2d, v29.2d, v6.2d + trn2 v8.2d, v30.2d, v8.2d + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #224] + ldr q2, [x2, #240] + ldr q1, [x3, #224] + ldr q3, [x3, #240] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v29.2d, v10.2d + trn2 v12.2d, v30.2d, v12.2d + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q2, [x2, #272] + ldr q1, [x3, #256] + ldr q3, [x3, #272] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v29.2d, v14.2d + trn2 v16.2d, v30.2d, v16.2d + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #288] + ldr q2, [x2, #304] + ldr q1, [x3, #288] + ldr q3, [x3, #304] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v29.2d, v18.2d + trn2 v20.2d, v30.2d, v20.2d + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + 
add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + ldr q0, [x2, #448] + ldr q2, [x2, #464] + ldr q1, [x3, #448] + ldr q3, [x3, #464] + mov v29.16b, v5.16b + mov v30.16b, v7.16b + trn1 v5.4s, v5.4s, v6.4s + trn1 v7.4s, v7.4s, v8.4s + trn2 v6.4s, v29.4s, v6.4s + trn2 v8.4s, v30.4s, v8.4s + mul v29.8h, v6.8h, v1.8h + mul v30.8h, v8.8h, v3.8h + sqrdmulh v21.8h, v6.8h, v0.8h + sqrdmulh v22.8h, v8.8h, v2.8h + sqrdmlsh v21.8h, v29.8h, v4.h[0] + sqrdmlsh v22.8h, v30.8h, v4.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + ldr q0, [x2, #480] + ldr q2, [x2, #496] + ldr q1, [x3, #480] + ldr q3, [x3, #496] + mov v29.16b, v9.16b + mov v30.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v29.4s, v10.4s + trn2 v12.4s, v30.4s, v12.4s + mul v29.8h, v10.8h, v1.8h + mul v30.8h, v12.8h, v3.8h + sqrdmulh v23.8h, v10.8h, v0.8h + sqrdmulh v24.8h, v12.8h, v2.8h + sqrdmlsh v23.8h, v29.8h, v4.h[0] + sqrdmlsh v24.8h, v30.8h, v4.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #512] + ldr q2, [x2, #528] + ldr q1, [x3, #512] + ldr q3, [x3, #528] + mov v29.16b, v13.16b + mov v30.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v29.4s, v14.4s + trn2 v16.4s, v30.4s, v16.4s + mul v29.8h, v14.8h, v1.8h + mul v30.8h, v16.8h, v3.8h + sqrdmulh v25.8h, v14.8h, v0.8h + sqrdmulh v26.8h, v16.8h, v2.8h + sqrdmlsh v25.8h, v29.8h, v4.h[0] + sqrdmlsh v26.8h, v30.8h, v4.h[0] + sshr v25.8h, v25.8h, #1 + sshr v26.8h, v26.8h, #1 + ldr q0, [x2, #544] + ldr q2, [x2, #560] + ldr q1, [x3, #544] + ldr q3, [x3, #560] + mov v29.16b, v17.16b + mov v30.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v29.4s, v18.4s + trn2 v20.4s, v30.4s, v20.4s + mul v29.8h, v18.8h, v1.8h + mul v30.8h, v20.8h, v3.8h + sqrdmulh v27.8h, v18.8h, v0.8h + sqrdmulh v28.8h, v20.8h, v2.8h + sqrdmlsh v27.8h, v29.8h, v4.h[0] + sqrdmlsh v28.8h, v30.8h, v4.h[0] + sshr v27.8h, v27.8h, #1 + sshr v28.8h, v28.8h, #1 + sub v6.8h, v5.8h, v21.8h + add v5.8h, v5.8h, v21.8h + sub v8.8h, v7.8h, v22.8h + add v7.8h, v7.8h, v22.8h + sub v10.8h, v9.8h, v23.8h + add v9.8h, v9.8h, v23.8h + sub v12.8h, v11.8h, v24.8h + add v11.8h, v11.8h, v24.8h + sub v14.8h, v13.8h, v25.8h + add v13.8h, v13.8h, v25.8h + sub v16.8h, v15.8h, v26.8h + add v15.8h, v15.8h, v26.8h + sub v18.8h, v17.8h, v27.8h + add v17.8h, v17.8h, v27.8h + sub v20.8h, v19.8h, v28.8h + add v19.8h, v19.8h, v28.8h + sqdmulh v21.8h, v5.8h, v4.h[2] + sqdmulh v22.8h, v6.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v5.8h, v21.8h, v4.h[0] + mls v6.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v7.8h, v4.h[2] + sqdmulh v22.8h, v8.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v7.8h, v21.8h, v4.h[0] + mls v8.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v9.8h, v4.h[2] + sqdmulh v22.8h, v10.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v9.8h, v21.8h, v4.h[0] + mls v10.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v11.8h, v4.h[2] + sqdmulh v22.8h, v12.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v11.8h, v21.8h, v4.h[0] + mls v12.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v13.8h, v4.h[2] + sqdmulh v22.8h, v14.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v13.8h, v21.8h, v4.h[0] + mls v14.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v15.8h, v4.h[2] + sqdmulh v22.8h, v16.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v15.8h, v21.8h, v4.h[0] + mls 
v16.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v17.8h, v4.h[2] + sqdmulh v22.8h, v18.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v17.8h, v21.8h, v4.h[0] + mls v18.8h, v22.8h, v4.h[0] + sqdmulh v21.8h, v19.8h, v4.h[2] + sqdmulh v22.8h, v20.8h, v4.h[2] + sshr v21.8h, v21.8h, #11 + sshr v22.8h, v22.8h, #11 + mls v19.8h, v21.8h, v4.h[0] + mls v20.8h, v22.8h, v4.h[0] + mov v29.16b, v5.16b + trn1 v5.4s, v5.4s, v6.4s + trn2 v6.4s, v29.4s, v6.4s + mov v29.16b, v5.16b + trn1 v5.2d, v5.2d, v6.2d + trn2 v6.2d, v29.2d, v6.2d + mov v29.16b, v7.16b + trn1 v7.4s, v7.4s, v8.4s + trn2 v8.4s, v29.4s, v8.4s + mov v29.16b, v7.16b + trn1 v7.2d, v7.2d, v8.2d + trn2 v8.2d, v29.2d, v8.2d + mov v29.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v29.4s, v10.4s + mov v29.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v29.2d, v10.2d + mov v29.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v29.4s, v12.4s + mov v29.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v29.2d, v12.2d + mov v29.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v29.4s, v14.4s + mov v29.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v29.2d, v14.2d + mov v29.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v29.4s, v16.4s + mov v29.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v29.2d, v16.2d + mov v29.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v29.4s, v18.4s + mov v29.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v29.2d, v18.2d + mov v29.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v29.4s, v20.4s + mov v29.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v29.2d, v20.2d + stp q5, q6, [x1] + stp q7, q8, [x1, #32] + stp q9, q10, [x1, #64] + stp q11, q12, [x1, #96] + stp q13, q14, [x1, #128] + stp q15, q16, [x1, #160] + stp q17, q18, [x1, #192] + stp q19, q20, [x1, #224] + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_ntt_sqrdmlsh,.-kyber_ntt_sqrdmlsh +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl kyber_invntt_sqrdmlsh +.type kyber_invntt_sqrdmlsh,@function +.align 2 +kyber_invntt_sqrdmlsh: +#else +.section __TEXT,__text +.globl _kyber_invntt_sqrdmlsh +.p2align 2 +_kyber_invntt_sqrdmlsh: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x2, L_kyber_aarch64_zetas_inv + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv +#else + adrp x2, L_kyber_aarch64_zetas_inv@PAGE + add x2, x2, :lo12:L_kyber_aarch64_zetas_inv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x3, L_kyber_aarch64_zetas_inv_qinv + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv +#else + adrp x3, L_kyber_aarch64_zetas_inv_qinv@PAGE + add x3, x3, :lo12:L_kyber_aarch64_zetas_inv_qinv@PAGEOFF +#endif /* __APPLE__ */ +#ifndef __APPLE__ + adrp x4, L_kyber_aarch64_consts + add x4, x4, :lo12:L_kyber_aarch64_consts +#else + adrp x4, L_kyber_aarch64_consts@PAGE + add x4, x4, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + add x1, x0, #0x100 + ldr q8, [x4] + ldp q9, q10, [x0] + ldp q11, q12, [x0, #32] + ldp q13, q14, [x0, #64] + ldp q15, q16, [x0, #96] + ldp q17, q18, [x0, #128] + ldp q19, q20, [x0, #160] + ldp q21, q22, [x0, #192] + ldp q23, q24, [x0, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2] + ldr q1, [x2, #16] + ldr q2, [x3] + ldr q3, [x3, #16] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #32] + ldr q1, [x2, #48] + ldr q2, [x3, #32] + ldr q3, [x3, #48] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #64] + ldr q1, [x2, #80] + ldr q2, [x3, #64] + ldr q3, [x3, #80] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, 
v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #96] + ldr q1, [x2, #112] + ldr q2, [x3, #96] + ldr q3, [x3, #112] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #256] + ldr q1, [x2, #272] + ldr q2, [x3, #256] + ldr q3, [x3, #272] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #288] + ldr q1, [x2, #304] + ldr q2, [x3, #288] + ldr q3, [x3, #304] + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #320] + ldr q1, [x2, #336] + ldr q2, [x3, #320] + ldr q3, [x3, #336] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #352] + ldr q1, [x2, #368] + ldr q2, [x3, #352] + ldr q3, [x3, #368] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #512] + ldr q2, [x3, #512] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, 
#1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x0] + stp q11, q12, [x0, #32] + stp q13, q14, [x0, #64] + stp q15, q16, [x0, #96] + stp q17, q18, [x0, #128] + stp q19, q20, [x0, #160] + stp q21, q22, [x0, #192] + stp q23, q24, [x0, #224] + ldp q9, q10, [x1] + ldp q11, q12, [x1, #32] + ldp q13, q14, [x1, #64] + ldp q15, q16, [x1, #96] + ldp q17, q18, [x1, #128] + ldp q19, q20, [x1, #160] + ldp q21, q22, [x1, #192] + ldp q23, q24, [x1, #224] + mov v25.16b, v9.16b + trn1 v9.2d, v9.2d, v10.2d + trn2 v10.2d, v25.2d, v10.2d + mov v25.16b, v9.16b + trn1 v9.4s, v9.4s, v10.4s + trn2 v10.4s, v25.4s, v10.4s + mov v25.16b, v11.16b + trn1 v11.2d, v11.2d, v12.2d + trn2 v12.2d, v25.2d, v12.2d + mov v25.16b, v11.16b + trn1 v11.4s, v11.4s, v12.4s + trn2 v12.4s, v25.4s, v12.4s + mov v25.16b, v13.16b + trn1 v13.2d, v13.2d, v14.2d + trn2 v14.2d, v25.2d, v14.2d + mov v25.16b, v13.16b + trn1 v13.4s, v13.4s, v14.4s + trn2 v14.4s, v25.4s, v14.4s + mov v25.16b, v15.16b + trn1 v15.2d, v15.2d, v16.2d + trn2 v16.2d, v25.2d, v16.2d + mov v25.16b, v15.16b + trn1 v15.4s, v15.4s, v16.4s + trn2 v16.4s, v25.4s, v16.4s + mov v25.16b, v17.16b + trn1 v17.2d, v17.2d, v18.2d + trn2 v18.2d, v25.2d, v18.2d + mov v25.16b, v17.16b + trn1 v17.4s, 
v17.4s, v18.4s + trn2 v18.4s, v25.4s, v18.4s + mov v25.16b, v19.16b + trn1 v19.2d, v19.2d, v20.2d + trn2 v20.2d, v25.2d, v20.2d + mov v25.16b, v19.16b + trn1 v19.4s, v19.4s, v20.4s + trn2 v20.4s, v25.4s, v20.4s + mov v25.16b, v21.16b + trn1 v21.2d, v21.2d, v22.2d + trn2 v22.2d, v25.2d, v22.2d + mov v25.16b, v21.16b + trn1 v21.4s, v21.4s, v22.4s + trn2 v22.4s, v25.4s, v22.4s + mov v25.16b, v23.16b + trn1 v23.2d, v23.2d, v24.2d + trn2 v24.2d, v25.2d, v24.2d + mov v25.16b, v23.16b + trn1 v23.4s, v23.4s, v24.4s + trn2 v24.4s, v25.4s, v24.4s + ldr q0, [x2, #128] + ldr q1, [x2, #144] + ldr q2, [x3, #128] + ldr q3, [x3, #144] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #160] + ldr q1, [x2, #176] + ldr q2, [x3, #160] + ldr q3, [x3, #176] + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #192] + ldr q1, [x2, #208] + ldr q2, [x3, #192] + ldr q3, [x3, #208] + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #224] + ldr q1, [x2, #240] + ldr q2, [x3, #224] + ldr q3, [x3, #240] + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #384] + ldr q1, [x2, #400] + ldr q2, [x3, #384] + ldr q3, [x3, #400] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.4s, v9.4s, v10.4s + trn1 v11.4s, v11.4s, v12.4s + trn2 v10.4s, v25.4s, v10.4s + trn2 v12.4s, v26.4s, v12.4s + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v10.8h, v26.8h, v0.8h + sqrdmulh v12.8h, v28.8h, v1.8h + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + ldr q0, [x2, #416] + ldr q1, [x2, #432] + ldr q2, [x3, #416] + ldr q3, [x3, #432] + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.4s, v13.4s, v14.4s + trn1 v15.4s, v15.4s, v16.4s + trn2 v14.4s, v25.4s, v14.4s + trn2 v16.4s, v26.4s, v16.4s + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v14.8h, v26.8h, v0.8h + sqrdmulh v16.8h, v28.8h, v1.8h + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + ldr q0, [x2, #448] + 
ldr q1, [x2, #464] + ldr q2, [x3, #448] + ldr q3, [x3, #464] + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.4s, v17.4s, v18.4s + trn1 v19.4s, v19.4s, v20.4s + trn2 v18.4s, v25.4s, v18.4s + trn2 v20.4s, v26.4s, v20.4s + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v18.8h, v26.8h, v0.8h + sqrdmulh v20.8h, v28.8h, v1.8h + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + ldr q0, [x2, #480] + ldr q1, [x2, #496] + ldr q2, [x3, #480] + ldr q3, [x3, #496] + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.4s, v21.4s, v22.4s + trn1 v23.4s, v23.4s, v24.4s + trn2 v22.4s, v25.4s, v22.4s + trn2 v24.4s, v26.4s, v24.4s + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.8h + mul v27.8h, v28.8h, v3.8h + sqrdmulh v22.8h, v26.8h, v0.8h + sqrdmulh v24.8h, v28.8h, v1.8h + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + ldr q0, [x2, #528] + ldr q2, [x3, #528] + mov v25.16b, v9.16b + mov v26.16b, v11.16b + trn1 v9.2d, v9.2d, v10.2d + trn1 v11.2d, v11.2d, v12.2d + trn2 v10.2d, v25.2d, v10.2d + trn2 v12.2d, v26.2d, v12.2d + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v2.h[0] + mul v27.8h, v28.8h, v2.h[1] + sqrdmulh v10.8h, v26.8h, v0.h[0] + sqrdmulh v12.8h, v28.8h, v0.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + mov v25.16b, v13.16b + mov v26.16b, v15.16b + trn1 v13.2d, v13.2d, v14.2d + trn1 v15.2d, v15.2d, v16.2d + trn2 v14.2d, v25.2d, v14.2d + trn2 v16.2d, v26.2d, v16.2d + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v2.h[2] + mul v27.8h, v28.8h, v2.h[3] + sqrdmulh v14.8h, v26.8h, v0.h[2] + sqrdmulh v16.8h, v28.8h, v0.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + mov v25.16b, v17.16b + mov v26.16b, v19.16b + trn1 v17.2d, v17.2d, v18.2d + trn1 v19.2d, v19.2d, v20.2d + trn2 v18.2d, v25.2d, v18.2d + trn2 v20.2d, v26.2d, v20.2d + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v2.h[4] + mul v27.8h, v28.8h, v2.h[5] + sqrdmulh v18.8h, v26.8h, v0.h[4] + sqrdmulh v20.8h, v28.8h, v0.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + mov v25.16b, v21.16b + mov v26.16b, v23.16b + trn1 v21.2d, v21.2d, v22.2d + trn1 v23.2d, v23.2d, v24.2d + trn2 v22.2d, v25.2d, v22.2d + trn2 v24.2d, v26.2d, v24.2d + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v2.h[6] + mul v27.8h, v28.8h, v2.h[7] + sqrdmulh v22.8h, v26.8h, v0.h[6] + sqrdmulh v24.8h, v28.8h, v0.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v11.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v11.8h, v26.8h, v8.h[0] + 
sqdmulh v25.8h, v13.8h, v8.h[2] + sqdmulh v26.8h, v15.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v13.8h, v25.8h, v8.h[0] + mls v15.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v19.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v19.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v21.8h, v8.h[2] + sqdmulh v26.8h, v23.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v21.8h, v25.8h, v8.h[0] + mls v23.8h, v26.8h, v8.h[0] + stp q9, q10, [x1] + stp q11, q12, [x1, #32] + stp q13, q14, [x1, #64] + stp q15, q16, [x1, #96] + stp q17, q18, [x1, #128] + stp q19, q20, [x1, #160] + stp q21, q22, [x1, #192] + stp q23, q24, [x1, #224] + ldr q4, [x2, #544] + ldr q5, [x2, #560] + ldr q6, [x3, #544] + ldr q7, [x3, #560] + ldr q9, [x0] + ldr q10, [x0, #32] + ldr q11, [x0, #64] + ldr q12, [x0, #96] + ldr q13, [x0, #128] + ldr q14, [x0, #160] + ldr q15, [x0, #192] + ldr q16, [x0, #224] + ldr q17, [x1] + ldr q18, [x1, #32] + ldr q19, [x1, #64] + ldr q20, [x1, #96] + ldr q21, [x1, #128] + ldr q22, [x1, #160] + ldr q23, [x1, #192] + ldr q24, [x1, #224] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + 
sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v27.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add 
v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] + sqrdmlsh v9.8h, v25.8h, v8.h[0] + sqrdmlsh v10.8h, v26.8h, v8.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v26.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v25.8h, v15.8h, v7.h[7] + mul v26.8h, v16.8h, v7.h[7] + sqrdmulh v15.8h, v15.8h, v5.h[7] + sqrdmulh v16.8h, v16.8h, v5.h[7] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v26.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + mul v25.8h, v17.8h, v7.h[7] + mul v26.8h, v18.8h, v7.h[7] + sqrdmulh v17.8h, v17.8h, v5.h[7] + sqrdmulh v18.8h, v18.8h, v5.h[7] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v26.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + mul v25.8h, v19.8h, v7.h[7] + mul v26.8h, v20.8h, v7.h[7] + sqrdmulh v19.8h, v19.8h, v5.h[7] + sqrdmulh v20.8h, v20.8h, v5.h[7] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v26.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + mul v25.8h, v21.8h, v7.h[7] + mul v26.8h, v22.8h, v7.h[7] + sqrdmulh v21.8h, v21.8h, v5.h[7] + sqrdmulh v22.8h, v22.8h, v5.h[7] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v26.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + mul v25.8h, v23.8h, v7.h[7] + mul v26.8h, v24.8h, v7.h[7] + sqrdmulh v23.8h, v23.8h, v5.h[7] + sqrdmulh v24.8h, v24.8h, v5.h[7] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v26.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + str q9, [x0] + str q10, [x0, #32] + str q11, [x0, #64] + str q12, [x0, #96] + str q13, [x0, #128] + str q14, [x0, #160] + str q15, [x0, #192] + str q16, [x0, #224] + str q17, [x1] + str q18, [x1, #32] + str q19, [x1, #64] + str q20, [x1, #96] + str q21, [x1, #128] + str q22, [x1, #160] + str q23, [x1, #192] + str q24, [x1, #224] + ldr q9, [x0, #16] + ldr q10, [x0, #48] + ldr q11, [x0, #80] + ldr q12, [x0, #112] + ldr q13, [x0, #144] + ldr q14, [x0, #176] + ldr q15, [x0, #208] + ldr q16, [x0, #240] + ldr q17, [x1, #16] + ldr q18, [x1, #48] + ldr q19, [x1, #80] + ldr q20, [x1, #112] + ldr q21, [x1, #144] + ldr q22, [x1, #176] + ldr q23, [x1, #208] + ldr q24, [x1, #240] + sub v26.8h, v9.8h, v10.8h + sub v28.8h, v11.8h, v12.8h + add v9.8h, v9.8h, v10.8h + add v11.8h, v11.8h, v12.8h + mul v25.8h, v26.8h, v6.h[0] + mul v27.8h, v28.8h, v6.h[1] + sqrdmulh v10.8h, 
v26.8h, v4.h[0] + sqrdmulh v12.8h, v28.8h, v4.h[1] + sqrdmlsh v10.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v10.8h, v10.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v14.8h + sub v28.8h, v15.8h, v16.8h + add v13.8h, v13.8h, v14.8h + add v15.8h, v15.8h, v16.8h + mul v25.8h, v26.8h, v6.h[2] + mul v27.8h, v28.8h, v6.h[3] + sqrdmulh v14.8h, v26.8h, v4.h[2] + sqrdmulh v16.8h, v28.8h, v4.h[3] + sqrdmlsh v14.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v14.8h, v14.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v18.8h + sub v28.8h, v19.8h, v20.8h + add v17.8h, v17.8h, v18.8h + add v19.8h, v19.8h, v20.8h + mul v25.8h, v26.8h, v6.h[4] + mul v27.8h, v28.8h, v6.h[5] + sqrdmulh v18.8h, v26.8h, v4.h[4] + sqrdmulh v20.8h, v28.8h, v4.h[5] + sqrdmlsh v18.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v18.8h, v18.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v22.8h + sub v28.8h, v23.8h, v24.8h + add v21.8h, v21.8h, v22.8h + add v23.8h, v23.8h, v24.8h + mul v25.8h, v26.8h, v6.h[6] + mul v27.8h, v28.8h, v6.h[7] + sqrdmulh v22.8h, v26.8h, v4.h[6] + sqrdmulh v24.8h, v28.8h, v4.h[7] + sqrdmlsh v22.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v22.8h, v22.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v11.8h + sub v28.8h, v10.8h, v12.8h + add v9.8h, v9.8h, v11.8h + add v10.8h, v10.8h, v12.8h + mul v25.8h, v26.8h, v7.h[0] + mul v27.8h, v28.8h, v7.h[0] + sqrdmulh v11.8h, v26.8h, v5.h[0] + sqrdmulh v12.8h, v28.8h, v5.h[0] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v27.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + sub v26.8h, v13.8h, v15.8h + sub v28.8h, v14.8h, v16.8h + add v13.8h, v13.8h, v15.8h + add v14.8h, v14.8h, v16.8h + mul v25.8h, v26.8h, v7.h[1] + mul v27.8h, v28.8h, v7.h[1] + sqrdmulh v15.8h, v26.8h, v5.h[1] + sqrdmulh v16.8h, v28.8h, v5.h[1] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v19.8h + sub v28.8h, v18.8h, v20.8h + add v17.8h, v17.8h, v19.8h + add v18.8h, v18.8h, v20.8h + mul v25.8h, v26.8h, v7.h[2] + mul v27.8h, v28.8h, v7.h[2] + sqrdmulh v19.8h, v26.8h, v5.h[2] + sqrdmulh v20.8h, v28.8h, v5.h[2] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v21.8h, v23.8h + sub v28.8h, v22.8h, v24.8h + add v21.8h, v21.8h, v23.8h + add v22.8h, v22.8h, v24.8h + mul v25.8h, v26.8h, v7.h[3] + mul v27.8h, v28.8h, v7.h[3] + sqrdmulh v23.8h, v26.8h, v5.h[3] + sqrdmulh v24.8h, v28.8h, v5.h[3] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sub v26.8h, v9.8h, v13.8h + sub v28.8h, v10.8h, v14.8h + add v9.8h, v9.8h, v13.8h + add v10.8h, v10.8h, v14.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v13.8h, v26.8h, v5.h[4] + sqrdmulh v14.8h, v28.8h, v5.h[4] + sqrdmlsh v13.8h, v25.8h, v8.h[0] + sqrdmlsh v14.8h, v27.8h, v8.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + sub v26.8h, v11.8h, v15.8h + sub v28.8h, v12.8h, v16.8h + add v11.8h, v11.8h, v15.8h + add v12.8h, v12.8h, v16.8h + mul v25.8h, v26.8h, v7.h[4] + mul v27.8h, v28.8h, v7.h[4] + sqrdmulh v15.8h, v26.8h, v5.h[4] + sqrdmulh v16.8h, v28.8h, v5.h[4] + sqrdmlsh v15.8h, v25.8h, v8.h[0] + sqrdmlsh v16.8h, v27.8h, v8.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + sub v26.8h, v17.8h, v21.8h + sub v28.8h, v18.8h, v22.8h + add v17.8h, 
v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v21.8h, v26.8h, v5.h[5] + sqrdmulh v22.8h, v28.8h, v5.h[5] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v19.8h, v23.8h + sub v28.8h, v20.8h, v24.8h + add v19.8h, v19.8h, v23.8h + add v20.8h, v20.8h, v24.8h + mul v25.8h, v26.8h, v7.h[5] + mul v27.8h, v28.8h, v7.h[5] + sqrdmulh v23.8h, v26.8h, v5.h[5] + sqrdmulh v24.8h, v28.8h, v5.h[5] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + sqdmulh v25.8h, v9.8h, v8.h[2] + sqdmulh v26.8h, v10.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v9.8h, v25.8h, v8.h[0] + mls v10.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v11.8h, v8.h[2] + sqdmulh v26.8h, v12.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v11.8h, v25.8h, v8.h[0] + mls v12.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v17.8h, v8.h[2] + sqdmulh v26.8h, v18.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v17.8h, v25.8h, v8.h[0] + mls v18.8h, v26.8h, v8.h[0] + sqdmulh v25.8h, v19.8h, v8.h[2] + sqdmulh v26.8h, v20.8h, v8.h[2] + sshr v25.8h, v25.8h, #11 + sshr v26.8h, v26.8h, #11 + mls v19.8h, v25.8h, v8.h[0] + mls v20.8h, v26.8h, v8.h[0] + sub v26.8h, v9.8h, v17.8h + sub v28.8h, v10.8h, v18.8h + add v9.8h, v9.8h, v17.8h + add v10.8h, v10.8h, v18.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v17.8h, v26.8h, v5.h[6] + sqrdmulh v18.8h, v28.8h, v5.h[6] + sqrdmlsh v17.8h, v25.8h, v8.h[0] + sqrdmlsh v18.8h, v27.8h, v8.h[0] + sshr v17.8h, v17.8h, #1 + sshr v18.8h, v18.8h, #1 + sub v26.8h, v11.8h, v19.8h + sub v28.8h, v12.8h, v20.8h + add v11.8h, v11.8h, v19.8h + add v12.8h, v12.8h, v20.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v19.8h, v26.8h, v5.h[6] + sqrdmulh v20.8h, v28.8h, v5.h[6] + sqrdmlsh v19.8h, v25.8h, v8.h[0] + sqrdmlsh v20.8h, v27.8h, v8.h[0] + sshr v19.8h, v19.8h, #1 + sshr v20.8h, v20.8h, #1 + sub v26.8h, v13.8h, v21.8h + sub v28.8h, v14.8h, v22.8h + add v13.8h, v13.8h, v21.8h + add v14.8h, v14.8h, v22.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v21.8h, v26.8h, v5.h[6] + sqrdmulh v22.8h, v28.8h, v5.h[6] + sqrdmlsh v21.8h, v25.8h, v8.h[0] + sqrdmlsh v22.8h, v27.8h, v8.h[0] + sshr v21.8h, v21.8h, #1 + sshr v22.8h, v22.8h, #1 + sub v26.8h, v15.8h, v23.8h + sub v28.8h, v16.8h, v24.8h + add v15.8h, v15.8h, v23.8h + add v16.8h, v16.8h, v24.8h + mul v25.8h, v26.8h, v7.h[6] + mul v27.8h, v28.8h, v7.h[6] + sqrdmulh v23.8h, v26.8h, v5.h[6] + sqrdmulh v24.8h, v28.8h, v5.h[6] + sqrdmlsh v23.8h, v25.8h, v8.h[0] + sqrdmlsh v24.8h, v27.8h, v8.h[0] + sshr v23.8h, v23.8h, #1 + sshr v24.8h, v24.8h, #1 + mul v25.8h, v9.8h, v7.h[7] + mul v26.8h, v10.8h, v7.h[7] + sqrdmulh v9.8h, v9.8h, v5.h[7] + sqrdmulh v10.8h, v10.8h, v5.h[7] + sqrdmlsh v9.8h, v25.8h, v8.h[0] + sqrdmlsh v10.8h, v26.8h, v8.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v25.8h, v11.8h, v7.h[7] + mul v26.8h, v12.8h, v7.h[7] + sqrdmulh v11.8h, v11.8h, v5.h[7] + sqrdmulh v12.8h, v12.8h, v5.h[7] + sqrdmlsh v11.8h, v25.8h, v8.h[0] + sqrdmlsh v12.8h, v26.8h, v8.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v25.8h, v13.8h, v7.h[7] + mul v26.8h, v14.8h, v7.h[7] + sqrdmulh v13.8h, v13.8h, v5.h[7] + sqrdmulh v14.8h, v14.8h, v5.h[7] sqrdmlsh v13.8h, v25.8h, v8.h[0] sqrdmlsh v14.8h, v26.8h, v8.h[0] 
-#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v13.8h, v13.8h, v25.8h - sub v14.8h, v14.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v26.8h, v8.h[0] -#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v15.8h, v15.8h, v25.8h - sub v16.8h, v16.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH sqrdmlsh v17.8h, v25.8h, v8.h[0] sqrdmlsh v18.8h, v26.8h, v8.h[0] -#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v17.8h, v17.8h, v25.8h - sub v18.8h, v18.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v26.8h, v8.h[0] -#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v19.8h, v19.8h, v25.8h - sub v20.8h, v20.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH sqrdmlsh v21.8h, v25.8h, v8.h[0] sqrdmlsh v22.8h, v26.8h, v8.h[0] -#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v21.8h, v21.8h, v25.8h - sub v22.8h, v22.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v25.8h, v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v26.8h, v8.h[0] -#else - sqrdmulh v25.8h, v25.8h, v8.h[0] - sqrdmulh v26.8h, v26.8h, v8.h[0] - sub v23.8h, v23.8h, v25.8h - sub v24.8h, v24.8h, v26.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0, #16] @@ -3682,8 +5531,9 @@ _kyber_invntt: ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ - .size kyber_invntt,.-kyber_invntt + .size kyber_invntt_sqrdmlsh,.-kyber_invntt_sqrdmlsh #endif /* __APPLE__ */ +#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */ #ifndef __APPLE__ .text .type L_kyber_aarch64_zetas_mul, %object @@ -5987,120 +7837,80 @@ _kyber_to_mont: mul v18.8h, v2.8h, v0.h[4] sqrdmulh v1.8h, v1.8h, v0.h[3] sqrdmulh v2.8h, v2.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v1.8h, v17.8h, v0.h[0] - sqrdmlsh v2.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v1.8h, v1.8h, v17.8h sub v2.8h, v2.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v1.8h, v1.8h, #1 sshr v2.8h, v2.8h, #1 mul v17.8h, v3.8h, v0.h[4] mul v18.8h, v4.8h, v0.h[4] sqrdmulh v3.8h, v3.8h, v0.h[3] sqrdmulh v4.8h, v4.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v3.8h, v17.8h, v0.h[0] - sqrdmlsh v4.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v3.8h, v3.8h, v17.8h 
sub v4.8h, v4.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v3.8h, v3.8h, #1 sshr v4.8h, v4.8h, #1 mul v17.8h, v5.8h, v0.h[4] mul v18.8h, v6.8h, v0.h[4] sqrdmulh v5.8h, v5.8h, v0.h[3] sqrdmulh v6.8h, v6.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v5.8h, v17.8h, v0.h[0] - sqrdmlsh v6.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v5.8h, v5.8h, v17.8h sub v6.8h, v6.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v5.8h, v5.8h, #1 sshr v6.8h, v6.8h, #1 mul v17.8h, v7.8h, v0.h[4] mul v18.8h, v8.8h, v0.h[4] sqrdmulh v7.8h, v7.8h, v0.h[3] sqrdmulh v8.8h, v8.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v7.8h, v17.8h, v0.h[0] - sqrdmlsh v8.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v7.8h, v7.8h, v17.8h sub v8.8h, v8.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v7.8h, v7.8h, #1 sshr v8.8h, v8.8h, #1 mul v17.8h, v9.8h, v0.h[4] mul v18.8h, v10.8h, v0.h[4] sqrdmulh v9.8h, v9.8h, v0.h[3] sqrdmulh v10.8h, v10.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v9.8h, v17.8h, v0.h[0] - sqrdmlsh v10.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v9.8h, v9.8h, v17.8h sub v10.8h, v10.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v17.8h, v11.8h, v0.h[4] mul v18.8h, v12.8h, v0.h[4] sqrdmulh v11.8h, v11.8h, v0.h[3] sqrdmulh v12.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v17.8h, v0.h[0] - sqrdmlsh v12.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v11.8h, v11.8h, v17.8h sub v12.8h, v12.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v17.8h, v13.8h, v0.h[4] mul v18.8h, v14.8h, v0.h[4] sqrdmulh v13.8h, v13.8h, v0.h[3] sqrdmulh v14.8h, v14.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v13.8h, v17.8h, v0.h[0] - sqrdmlsh v14.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v13.8h, v13.8h, v17.8h sub v14.8h, v14.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v17.8h, v15.8h, v0.h[4] mul v18.8h, v16.8h, v0.h[4] sqrdmulh v15.8h, v15.8h, v0.h[3] sqrdmulh v16.8h, v16.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v17.8h, v0.h[0] - sqrdmlsh v16.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v15.8h, v15.8h, v17.8h sub v16.8h, v16.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 @@ -6116,120 +7926,80 @@ _kyber_to_mont: mul v18.8h, v2.8h, v0.h[4] sqrdmulh v1.8h, v1.8h, v0.h[3] sqrdmulh v2.8h, v2.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v1.8h, v17.8h, v0.h[0] - sqrdmlsh v2.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v1.8h, v1.8h, v17.8h sub v2.8h, v2.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v1.8h, v1.8h, #1 sshr v2.8h, v2.8h, #1 mul v17.8h, v3.8h, v0.h[4] mul v18.8h, v4.8h, v0.h[4] sqrdmulh v3.8h, v3.8h, v0.h[3] sqrdmulh v4.8h, v4.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v3.8h, v17.8h, v0.h[0] - sqrdmlsh v4.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v3.8h, v3.8h, v17.8h sub 
v4.8h, v4.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v3.8h, v3.8h, #1 sshr v4.8h, v4.8h, #1 mul v17.8h, v5.8h, v0.h[4] mul v18.8h, v6.8h, v0.h[4] sqrdmulh v5.8h, v5.8h, v0.h[3] sqrdmulh v6.8h, v6.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v5.8h, v17.8h, v0.h[0] - sqrdmlsh v6.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v5.8h, v5.8h, v17.8h sub v6.8h, v6.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v5.8h, v5.8h, #1 sshr v6.8h, v6.8h, #1 mul v17.8h, v7.8h, v0.h[4] mul v18.8h, v8.8h, v0.h[4] sqrdmulh v7.8h, v7.8h, v0.h[3] sqrdmulh v8.8h, v8.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v7.8h, v17.8h, v0.h[0] - sqrdmlsh v8.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v7.8h, v7.8h, v17.8h sub v8.8h, v8.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v7.8h, v7.8h, #1 sshr v8.8h, v8.8h, #1 mul v17.8h, v9.8h, v0.h[4] mul v18.8h, v10.8h, v0.h[4] sqrdmulh v9.8h, v9.8h, v0.h[3] sqrdmulh v10.8h, v10.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v9.8h, v17.8h, v0.h[0] - sqrdmlsh v10.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v9.8h, v9.8h, v17.8h sub v10.8h, v10.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v17.8h, v11.8h, v0.h[4] mul v18.8h, v12.8h, v0.h[4] sqrdmulh v11.8h, v11.8h, v0.h[3] sqrdmulh v12.8h, v12.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v11.8h, v17.8h, v0.h[0] - sqrdmlsh v12.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v11.8h, v11.8h, v17.8h sub v12.8h, v12.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v17.8h, v13.8h, v0.h[4] mul v18.8h, v14.8h, v0.h[4] sqrdmulh v13.8h, v13.8h, v0.h[3] sqrdmulh v14.8h, v14.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v13.8h, v17.8h, v0.h[0] - sqrdmlsh v14.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v13.8h, v13.8h, v17.8h sub v14.8h, v14.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v17.8h, v15.8h, v0.h[4] mul v18.8h, v16.8h, v0.h[4] sqrdmulh v15.8h, v15.8h, v0.h[3] sqrdmulh v16.8h, v16.8h, v0.h[3] -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - sqrdmlsh v15.8h, v17.8h, v0.h[0] - sqrdmlsh v16.8h, v18.8h, v0.h[0] -#else sqrdmulh v17.8h, v17.8h, v0.h[0] sqrdmulh v18.8h, v18.8h, v0.h[0] sub v15.8h, v15.8h, v17.8h sub v16.8h, v16.8h, v18.8h -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 @@ -6245,6 +8015,189 @@ _kyber_to_mont: #ifndef __APPLE__ .size kyber_to_mont,.-kyber_to_mont #endif /* __APPLE__ */ +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH +#ifndef __APPLE__ +.text +.globl kyber_to_mont_sqrdmlsh +.type kyber_to_mont_sqrdmlsh,@function +.align 2 +kyber_to_mont_sqrdmlsh: +#else +.section __TEXT,__text +.globl _kyber_to_mont_sqrdmlsh +.p2align 2 +_kyber_to_mont_sqrdmlsh: +#endif /* __APPLE__ */ + stp x29, x30, [sp, #-80]! 
+ add x29, sp, #0 + stp d8, d9, [x29, #16] + stp d10, d11, [x29, #32] + stp d12, d13, [x29, #48] + stp d14, d15, [x29, #64] +#ifndef __APPLE__ + adrp x1, L_kyber_aarch64_consts + add x1, x1, :lo12:L_kyber_aarch64_consts +#else + adrp x1, L_kyber_aarch64_consts@PAGE + add x1, x1, :lo12:L_kyber_aarch64_consts@PAGEOFF +#endif /* __APPLE__ */ + ldr q0, [x1] + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, v0.h[3] + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] + sqrdmlsh v11.8h, v17.8h, v0.h[0] + sqrdmlsh v12.8h, v18.8h, v0.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + sub x0, x0, #0x100 + mul v17.8h, v1.8h, v0.h[4] + mul v18.8h, v2.8h, v0.h[4] + sqrdmulh v1.8h, v1.8h, v0.h[3] + sqrdmulh v2.8h, v2.8h, v0.h[3] + sqrdmlsh v1.8h, v17.8h, v0.h[0] + sqrdmlsh v2.8h, v18.8h, v0.h[0] + sshr v1.8h, v1.8h, #1 + sshr v2.8h, v2.8h, #1 + mul v17.8h, v3.8h, v0.h[4] + mul v18.8h, v4.8h, v0.h[4] + sqrdmulh v3.8h, v3.8h, v0.h[3] + sqrdmulh v4.8h, v4.8h, v0.h[3] + sqrdmlsh v3.8h, v17.8h, v0.h[0] + sqrdmlsh v4.8h, v18.8h, v0.h[0] + sshr v3.8h, v3.8h, #1 + sshr v4.8h, v4.8h, #1 + mul v17.8h, v5.8h, v0.h[4] + mul v18.8h, v6.8h, v0.h[4] + sqrdmulh v5.8h, v5.8h, v0.h[3] + sqrdmulh v6.8h, v6.8h, 
v0.h[3] + sqrdmlsh v5.8h, v17.8h, v0.h[0] + sqrdmlsh v6.8h, v18.8h, v0.h[0] + sshr v5.8h, v5.8h, #1 + sshr v6.8h, v6.8h, #1 + mul v17.8h, v7.8h, v0.h[4] + mul v18.8h, v8.8h, v0.h[4] + sqrdmulh v7.8h, v7.8h, v0.h[3] + sqrdmulh v8.8h, v8.8h, v0.h[3] + sqrdmlsh v7.8h, v17.8h, v0.h[0] + sqrdmlsh v8.8h, v18.8h, v0.h[0] + sshr v7.8h, v7.8h, #1 + sshr v8.8h, v8.8h, #1 + mul v17.8h, v9.8h, v0.h[4] + mul v18.8h, v10.8h, v0.h[4] + sqrdmulh v9.8h, v9.8h, v0.h[3] + sqrdmulh v10.8h, v10.8h, v0.h[3] + sqrdmlsh v9.8h, v17.8h, v0.h[0] + sqrdmlsh v10.8h, v18.8h, v0.h[0] + sshr v9.8h, v9.8h, #1 + sshr v10.8h, v10.8h, #1 + mul v17.8h, v11.8h, v0.h[4] + mul v18.8h, v12.8h, v0.h[4] + sqrdmulh v11.8h, v11.8h, v0.h[3] + sqrdmulh v12.8h, v12.8h, v0.h[3] + sqrdmlsh v11.8h, v17.8h, v0.h[0] + sqrdmlsh v12.8h, v18.8h, v0.h[0] + sshr v11.8h, v11.8h, #1 + sshr v12.8h, v12.8h, #1 + mul v17.8h, v13.8h, v0.h[4] + mul v18.8h, v14.8h, v0.h[4] + sqrdmulh v13.8h, v13.8h, v0.h[3] + sqrdmulh v14.8h, v14.8h, v0.h[3] + sqrdmlsh v13.8h, v17.8h, v0.h[0] + sqrdmlsh v14.8h, v18.8h, v0.h[0] + sshr v13.8h, v13.8h, #1 + sshr v14.8h, v14.8h, #1 + mul v17.8h, v15.8h, v0.h[4] + mul v18.8h, v16.8h, v0.h[4] + sqrdmulh v15.8h, v15.8h, v0.h[3] + sqrdmulh v16.8h, v16.8h, v0.h[3] + sqrdmlsh v15.8h, v17.8h, v0.h[0] + sqrdmlsh v16.8h, v18.8h, v0.h[0] + sshr v15.8h, v15.8h, #1 + sshr v16.8h, v16.8h, #1 + st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 + st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 + st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40 + st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40 + ldp d8, d9, [x29, #16] + ldp d10, d11, [x29, #32] + ldp d12, d13, [x29, #48] + ldp d14, d15, [x29, #64] + ldp x29, x30, [sp], #0x50 + ret +#ifndef __APPLE__ + .size kyber_to_mont_sqrdmlsh,.-kyber_to_mont_sqrdmlsh +#endif /* __APPLE__ */ +#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */ #ifndef __APPLE__ .text .type L_kyber_aarch64_to_msg_neon_low, %object diff --git a/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c index ff8d4042fe..b120b6cd7b 100644 --- a/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-kyber-asm_c.c @@ -33,637 +33,107 @@ #ifdef __aarch64__ #ifdef WOLFSSL_ARMASM_INLINE static const word16 L_kyber_aarch64_q[] = { - 0xd01, - 0xd01, - 0xd01, - 0xd01, - 0xd01, - 0xd01, - 0xd01, - 0xd01, + 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, 0x0d01, }; static const word16 L_kyber_aarch64_consts[] = { - 0xd01, - 0xf301, - 0x4ebf, - 0x549, - 0x5049, - 0x0, - 0x0, - 0x0, + 0x0d01, 0xf301, 0x4ebf, 0x0549, 0x5049, 0x0000, 0x0000, 0x0000, }; static const word64 L_sha3_aarch64_r[] = { - 0x1UL, - 0x8082UL, - 0x800000000000808aUL, - 0x8000000080008000UL, - 0x808bUL, - 0x80000001UL, - 0x8000000080008081UL, - 0x8000000000008009UL, - 0x8aUL, - 0x88UL, - 0x80008009UL, - 0x8000000aUL, - 0x8000808bUL, - 0x800000000000008bUL, - 0x8000000000008089UL, - 0x8000000000008003UL, - 0x8000000000008002UL, - 0x8000000000000080UL, - 0x800aUL, - 0x800000008000000aUL, - 0x8000000080008081UL, - 0x8000000000008080UL, - 0x80000001UL, - 0x8000000080008008UL, + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808a, 0x8000000080008000, + 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008a, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, + 0x8000000080008081, 
0x8000000000008080, + 0x0000000080000001, 0x8000000080008008, }; #include #ifdef WOLFSSL_WC_KYBER static const word16 L_kyber_aarch64_zetas[] = { - 0x8ed, - 0xa0b, - 0xb9a, - 0x714, - 0x5d5, - 0x58e, - 0x11f, - 0xca, - 0xc56, - 0x26e, - 0x629, - 0xb6, - 0x3c2, - 0x84f, - 0x73f, - 0x5bc, - 0x23d, - 0x7d4, - 0x108, - 0x17f, - 0x9c4, - 0x5b2, - 0x6bf, - 0xc7f, - 0xa58, - 0x3f9, - 0x2dc, - 0x260, - 0x6fb, - 0x19b, - 0xc34, - 0x6de, - 0x4c7, - 0x4c7, - 0x4c7, - 0x4c7, - 0x28c, - 0x28c, - 0x28c, - 0x28c, - 0xad9, - 0xad9, - 0xad9, - 0xad9, - 0x3f7, - 0x3f7, - 0x3f7, - 0x3f7, - 0x7f4, - 0x7f4, - 0x7f4, - 0x7f4, - 0x5d3, - 0x5d3, - 0x5d3, - 0x5d3, - 0xbe7, - 0xbe7, - 0xbe7, - 0xbe7, - 0x6f9, - 0x6f9, - 0x6f9, - 0x6f9, - 0x204, - 0x204, - 0x204, - 0x204, - 0xcf9, - 0xcf9, - 0xcf9, - 0xcf9, - 0xbc1, - 0xbc1, - 0xbc1, - 0xbc1, - 0xa67, - 0xa67, - 0xa67, - 0xa67, - 0x6af, - 0x6af, - 0x6af, - 0x6af, - 0x877, - 0x877, - 0x877, - 0x877, - 0x7e, - 0x7e, - 0x7e, - 0x7e, - 0x5bd, - 0x5bd, - 0x5bd, - 0x5bd, - 0x9ac, - 0x9ac, - 0x9ac, - 0x9ac, - 0xca7, - 0xca7, - 0xca7, - 0xca7, - 0xbf2, - 0xbf2, - 0xbf2, - 0xbf2, - 0x33e, - 0x33e, - 0x33e, - 0x33e, - 0x6b, - 0x6b, - 0x6b, - 0x6b, - 0x774, - 0x774, - 0x774, - 0x774, - 0xc0a, - 0xc0a, - 0xc0a, - 0xc0a, - 0x94a, - 0x94a, - 0x94a, - 0x94a, - 0xb73, - 0xb73, - 0xb73, - 0xb73, - 0x3c1, - 0x3c1, - 0x3c1, - 0x3c1, - 0x71d, - 0x71d, - 0x71d, - 0x71d, - 0xa2c, - 0xa2c, - 0xa2c, - 0xa2c, - 0x1c0, - 0x1c0, - 0x1c0, - 0x1c0, - 0x8d8, - 0x8d8, - 0x8d8, - 0x8d8, - 0x2a5, - 0x2a5, - 0x2a5, - 0x2a5, - 0x806, - 0x806, - 0x806, - 0x806, - 0x8b2, - 0x8b2, - 0x1ae, - 0x1ae, - 0x22b, - 0x22b, - 0x34b, - 0x34b, - 0x81e, - 0x81e, - 0x367, - 0x367, - 0x60e, - 0x60e, - 0x69, - 0x69, - 0x1a6, - 0x1a6, - 0x24b, - 0x24b, - 0xb1, - 0xb1, - 0xc16, - 0xc16, - 0xbde, - 0xbde, - 0xb35, - 0xb35, - 0x626, - 0x626, - 0x675, - 0x675, - 0xc0b, - 0xc0b, - 0x30a, - 0x30a, - 0x487, - 0x487, - 0xc6e, - 0xc6e, - 0x9f8, - 0x9f8, - 0x5cb, - 0x5cb, - 0xaa7, - 0xaa7, - 0x45f, - 0x45f, - 0x6cb, - 0x6cb, - 0x284, - 0x284, - 0x999, - 0x999, - 0x15d, - 0x15d, - 0x1a2, - 0x1a2, - 0x149, - 0x149, - 0xc65, - 0xc65, - 0xcb6, - 0xcb6, - 0x331, - 0x331, - 0x449, - 0x449, - 0x25b, - 0x25b, - 0x262, - 0x262, - 0x52a, - 0x52a, - 0x7fc, - 0x7fc, - 0x748, - 0x748, - 0x180, - 0x180, - 0x842, - 0x842, - 0xc79, - 0xc79, - 0x4c2, - 0x4c2, - 0x7ca, - 0x7ca, - 0x997, - 0x997, - 0xdc, - 0xdc, - 0x85e, - 0x85e, - 0x686, - 0x686, - 0x860, - 0x860, - 0x707, - 0x707, - 0x803, - 0x803, - 0x31a, - 0x31a, - 0x71b, - 0x71b, - 0x9ab, - 0x9ab, - 0x99b, - 0x99b, - 0x1de, - 0x1de, - 0xc95, - 0xc95, - 0xbcd, - 0xbcd, - 0x3e4, - 0x3e4, - 0x3df, - 0x3df, - 0x3be, - 0x3be, - 0x74d, - 0x74d, - 0x5f2, - 0x5f2, - 0x65c, - 0x65c, + 0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca, + 0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc, + 0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f, + 0x0a58, 0x03f9, 0x02dc, 0x0260, 0x06fb, 0x019b, 0x0c34, 0x06de, + 0x04c7, 0x04c7, 0x04c7, 0x04c7, 0x028c, 0x028c, 0x028c, 0x028c, + 0x0ad9, 0x0ad9, 0x0ad9, 0x0ad9, 0x03f7, 0x03f7, 0x03f7, 0x03f7, + 0x07f4, 0x07f4, 0x07f4, 0x07f4, 0x05d3, 0x05d3, 0x05d3, 0x05d3, + 0x0be7, 0x0be7, 0x0be7, 0x0be7, 0x06f9, 0x06f9, 0x06f9, 0x06f9, + 0x0204, 0x0204, 0x0204, 0x0204, 0x0cf9, 0x0cf9, 0x0cf9, 0x0cf9, + 0x0bc1, 0x0bc1, 0x0bc1, 0x0bc1, 0x0a67, 0x0a67, 0x0a67, 0x0a67, + 0x06af, 0x06af, 0x06af, 0x06af, 0x0877, 0x0877, 0x0877, 0x0877, + 0x007e, 0x007e, 0x007e, 0x007e, 0x05bd, 0x05bd, 0x05bd, 0x05bd, + 0x09ac, 0x09ac, 0x09ac, 0x09ac, 
0x0ca7, 0x0ca7, 0x0ca7, 0x0ca7, + 0x0bf2, 0x0bf2, 0x0bf2, 0x0bf2, 0x033e, 0x033e, 0x033e, 0x033e, + 0x006b, 0x006b, 0x006b, 0x006b, 0x0774, 0x0774, 0x0774, 0x0774, + 0x0c0a, 0x0c0a, 0x0c0a, 0x0c0a, 0x094a, 0x094a, 0x094a, 0x094a, + 0x0b73, 0x0b73, 0x0b73, 0x0b73, 0x03c1, 0x03c1, 0x03c1, 0x03c1, + 0x071d, 0x071d, 0x071d, 0x071d, 0x0a2c, 0x0a2c, 0x0a2c, 0x0a2c, + 0x01c0, 0x01c0, 0x01c0, 0x01c0, 0x08d8, 0x08d8, 0x08d8, 0x08d8, + 0x02a5, 0x02a5, 0x02a5, 0x02a5, 0x0806, 0x0806, 0x0806, 0x0806, + 0x08b2, 0x08b2, 0x01ae, 0x01ae, 0x022b, 0x022b, 0x034b, 0x034b, + 0x081e, 0x081e, 0x0367, 0x0367, 0x060e, 0x060e, 0x0069, 0x0069, + 0x01a6, 0x01a6, 0x024b, 0x024b, 0x00b1, 0x00b1, 0x0c16, 0x0c16, + 0x0bde, 0x0bde, 0x0b35, 0x0b35, 0x0626, 0x0626, 0x0675, 0x0675, + 0x0c0b, 0x0c0b, 0x030a, 0x030a, 0x0487, 0x0487, 0x0c6e, 0x0c6e, + 0x09f8, 0x09f8, 0x05cb, 0x05cb, 0x0aa7, 0x0aa7, 0x045f, 0x045f, + 0x06cb, 0x06cb, 0x0284, 0x0284, 0x0999, 0x0999, 0x015d, 0x015d, + 0x01a2, 0x01a2, 0x0149, 0x0149, 0x0c65, 0x0c65, 0x0cb6, 0x0cb6, + 0x0331, 0x0331, 0x0449, 0x0449, 0x025b, 0x025b, 0x0262, 0x0262, + 0x052a, 0x052a, 0x07fc, 0x07fc, 0x0748, 0x0748, 0x0180, 0x0180, + 0x0842, 0x0842, 0x0c79, 0x0c79, 0x04c2, 0x04c2, 0x07ca, 0x07ca, + 0x0997, 0x0997, 0x00dc, 0x00dc, 0x085e, 0x085e, 0x0686, 0x0686, + 0x0860, 0x0860, 0x0707, 0x0707, 0x0803, 0x0803, 0x031a, 0x031a, + 0x071b, 0x071b, 0x09ab, 0x09ab, 0x099b, 0x099b, 0x01de, 0x01de, + 0x0c95, 0x0c95, 0x0bcd, 0x0bcd, 0x03e4, 0x03e4, 0x03df, 0x03df, + 0x03be, 0x03be, 0x074d, 0x074d, 0x05f2, 0x05f2, 0x065c, 0x065c, }; static const word16 L_kyber_aarch64_zetas_qinv[] = { - 0xffed, - 0x7b0b, - 0x399a, - 0x314, - 0x34d5, - 0xcf8e, - 0x6e1f, - 0xbeca, - 0xae56, - 0x6c6e, - 0xf129, - 0xc2b6, - 0x29c2, - 0x54f, - 0xd43f, - 0x79bc, - 0xe93d, - 0x43d4, - 0x9908, - 0x8e7f, - 0x15c4, - 0xfbb2, - 0x53bf, - 0x997f, - 0x9258, - 0x5ef9, - 0xd6dc, - 0x2260, - 0x47fb, - 0x229b, - 0x6834, - 0xc0de, - 0xe9c7, - 0xe9c7, - 0xe9c7, - 0xe9c7, - 0xe68c, - 0xe68c, - 0xe68c, - 0xe68c, - 0x5d9, - 0x5d9, - 0x5d9, - 0x5d9, - 0x78f7, - 0x78f7, - 0x78f7, - 0x78f7, - 0xa3f4, - 0xa3f4, - 0xa3f4, - 0xa3f4, - 0x4ed3, - 0x4ed3, - 0x4ed3, - 0x4ed3, - 0x50e7, - 0x50e7, - 0x50e7, - 0x50e7, - 0x61f9, - 0x61f9, - 0x61f9, - 0x61f9, - 0xce04, - 0xce04, - 0xce04, - 0xce04, - 0x67f9, - 0x67f9, - 0x67f9, - 0x67f9, - 0x3ec1, - 0x3ec1, - 0x3ec1, - 0x3ec1, - 0xcf67, - 0xcf67, - 0xcf67, - 0xcf67, - 0x23af, - 0x23af, - 0x23af, - 0x23af, - 0xfd77, - 0xfd77, - 0xfd77, - 0xfd77, - 0x9a7e, - 0x9a7e, - 0x9a7e, - 0x9a7e, - 0x6cbd, - 0x6cbd, - 0x6cbd, - 0x6cbd, - 0x4dac, - 0x4dac, - 0x4dac, - 0x4dac, - 0x91a7, - 0x91a7, - 0x91a7, - 0x91a7, - 0xc1f2, - 0xc1f2, - 0xc1f2, - 0xc1f2, - 0xdd3e, - 0xdd3e, - 0xdd3e, - 0xdd3e, - 0x916b, - 0x916b, - 0x916b, - 0x916b, - 0x2374, - 0x2374, - 0x2374, - 0x2374, - 0x8a0a, - 0x8a0a, - 0x8a0a, - 0x8a0a, - 0x474a, - 0x474a, - 0x474a, - 0x474a, - 0x3473, - 0x3473, - 0x3473, - 0x3473, - 0x36c1, - 0x36c1, - 0x36c1, - 0x36c1, - 0x8e1d, - 0x8e1d, - 0x8e1d, - 0x8e1d, - 0xce2c, - 0xce2c, - 0xce2c, - 0xce2c, - 0x41c0, - 0x41c0, - 0x41c0, - 0x41c0, - 0x10d8, - 0x10d8, - 0x10d8, - 0x10d8, - 0xa1a5, - 0xa1a5, - 0xa1a5, - 0xa1a5, - 0xba06, - 0xba06, - 0xba06, - 0xba06, - 0xfeb2, - 0xfeb2, - 0x2bae, - 0x2bae, - 0xd32b, - 0xd32b, - 0x344b, - 0x344b, - 0x821e, - 0x821e, - 0xc867, - 0xc867, - 0x500e, - 0x500e, - 0xab69, - 0xab69, - 0x93a6, - 0x93a6, - 0x334b, - 0x334b, - 0x3b1, - 0x3b1, - 0xee16, - 0xee16, - 0xc5de, - 0xc5de, - 0x5a35, - 0x5a35, - 0x1826, - 0x1826, - 0x1575, - 0x1575, - 0x7d0b, - 0x7d0b, - 0x810a, - 
0x810a, - 0x2987, - 0x2987, - 0x766e, - 0x766e, - 0x71f8, - 0x71f8, - 0xb6cb, - 0xb6cb, - 0x8fa7, - 0x8fa7, - 0x315f, - 0x315f, - 0xb7cb, - 0xb7cb, - 0x4e84, - 0x4e84, - 0x4499, - 0x4499, - 0x485d, - 0x485d, - 0xc7a2, - 0xc7a2, - 0x4c49, - 0x4c49, - 0xeb65, - 0xeb65, - 0xceb6, - 0xceb6, - 0x8631, - 0x8631, - 0x4f49, - 0x4f49, - 0x635b, - 0x635b, - 0x862, - 0x862, - 0xe32a, - 0xe32a, - 0x3bfc, - 0x3bfc, - 0x5f48, - 0x5f48, - 0x8180, - 0x8180, - 0xae42, - 0xae42, - 0xe779, - 0xe779, - 0x2ac2, - 0x2ac2, - 0xc5ca, - 0xc5ca, - 0x5e97, - 0x5e97, - 0xd4dc, - 0xd4dc, - 0x425e, - 0x425e, - 0x3886, - 0x3886, - 0x2860, - 0x2860, - 0xac07, - 0xac07, - 0xe103, - 0xe103, - 0xb11a, - 0xb11a, - 0xa81b, - 0xa81b, - 0x5aab, - 0x5aab, - 0x2a9b, - 0x2a9b, - 0xbbde, - 0xbbde, - 0x7b95, - 0x7b95, - 0xa2cd, - 0xa2cd, - 0x6fe4, - 0x6fe4, - 0xb0df, - 0xb0df, - 0x5dbe, - 0x5dbe, - 0x1e4d, - 0x1e4d, - 0xbbf2, - 0xbbf2, - 0x5a5c, - 0x5a5c, + 0xffed, 0x7b0b, 0x399a, 0x0314, 0x34d5, 0xcf8e, 0x6e1f, 0xbeca, + 0xae56, 0x6c6e, 0xf129, 0xc2b6, 0x29c2, 0x054f, 0xd43f, 0x79bc, + 0xe93d, 0x43d4, 0x9908, 0x8e7f, 0x15c4, 0xfbb2, 0x53bf, 0x997f, + 0x9258, 0x5ef9, 0xd6dc, 0x2260, 0x47fb, 0x229b, 0x6834, 0xc0de, + 0xe9c7, 0xe9c7, 0xe9c7, 0xe9c7, 0xe68c, 0xe68c, 0xe68c, 0xe68c, + 0x05d9, 0x05d9, 0x05d9, 0x05d9, 0x78f7, 0x78f7, 0x78f7, 0x78f7, + 0xa3f4, 0xa3f4, 0xa3f4, 0xa3f4, 0x4ed3, 0x4ed3, 0x4ed3, 0x4ed3, + 0x50e7, 0x50e7, 0x50e7, 0x50e7, 0x61f9, 0x61f9, 0x61f9, 0x61f9, + 0xce04, 0xce04, 0xce04, 0xce04, 0x67f9, 0x67f9, 0x67f9, 0x67f9, + 0x3ec1, 0x3ec1, 0x3ec1, 0x3ec1, 0xcf67, 0xcf67, 0xcf67, 0xcf67, + 0x23af, 0x23af, 0x23af, 0x23af, 0xfd77, 0xfd77, 0xfd77, 0xfd77, + 0x9a7e, 0x9a7e, 0x9a7e, 0x9a7e, 0x6cbd, 0x6cbd, 0x6cbd, 0x6cbd, + 0x4dac, 0x4dac, 0x4dac, 0x4dac, 0x91a7, 0x91a7, 0x91a7, 0x91a7, + 0xc1f2, 0xc1f2, 0xc1f2, 0xc1f2, 0xdd3e, 0xdd3e, 0xdd3e, 0xdd3e, + 0x916b, 0x916b, 0x916b, 0x916b, 0x2374, 0x2374, 0x2374, 0x2374, + 0x8a0a, 0x8a0a, 0x8a0a, 0x8a0a, 0x474a, 0x474a, 0x474a, 0x474a, + 0x3473, 0x3473, 0x3473, 0x3473, 0x36c1, 0x36c1, 0x36c1, 0x36c1, + 0x8e1d, 0x8e1d, 0x8e1d, 0x8e1d, 0xce2c, 0xce2c, 0xce2c, 0xce2c, + 0x41c0, 0x41c0, 0x41c0, 0x41c0, 0x10d8, 0x10d8, 0x10d8, 0x10d8, + 0xa1a5, 0xa1a5, 0xa1a5, 0xa1a5, 0xba06, 0xba06, 0xba06, 0xba06, + 0xfeb2, 0xfeb2, 0x2bae, 0x2bae, 0xd32b, 0xd32b, 0x344b, 0x344b, + 0x821e, 0x821e, 0xc867, 0xc867, 0x500e, 0x500e, 0xab69, 0xab69, + 0x93a6, 0x93a6, 0x334b, 0x334b, 0x03b1, 0x03b1, 0xee16, 0xee16, + 0xc5de, 0xc5de, 0x5a35, 0x5a35, 0x1826, 0x1826, 0x1575, 0x1575, + 0x7d0b, 0x7d0b, 0x810a, 0x810a, 0x2987, 0x2987, 0x766e, 0x766e, + 0x71f8, 0x71f8, 0xb6cb, 0xb6cb, 0x8fa7, 0x8fa7, 0x315f, 0x315f, + 0xb7cb, 0xb7cb, 0x4e84, 0x4e84, 0x4499, 0x4499, 0x485d, 0x485d, + 0xc7a2, 0xc7a2, 0x4c49, 0x4c49, 0xeb65, 0xeb65, 0xceb6, 0xceb6, + 0x8631, 0x8631, 0x4f49, 0x4f49, 0x635b, 0x635b, 0x0862, 0x0862, + 0xe32a, 0xe32a, 0x3bfc, 0x3bfc, 0x5f48, 0x5f48, 0x8180, 0x8180, + 0xae42, 0xae42, 0xe779, 0xe779, 0x2ac2, 0x2ac2, 0xc5ca, 0xc5ca, + 0x5e97, 0x5e97, 0xd4dc, 0xd4dc, 0x425e, 0x425e, 0x3886, 0x3886, + 0x2860, 0x2860, 0xac07, 0xac07, 0xe103, 0xe103, 0xb11a, 0xb11a, + 0xa81b, 0xa81b, 0x5aab, 0x5aab, 0x2a9b, 0x2a9b, 0xbbde, 0xbbde, + 0x7b95, 0x7b95, 0xa2cd, 0xa2cd, 0x6fe4, 0x6fe4, 0xb0df, 0xb0df, + 0x5dbe, 0x5dbe, 0x1e4d, 0x1e4d, 0xbbf2, 0xbbf2, 0x5a5c, 0x5a5c, }; void kyber_ntt(sword16* r) @@ -714,60 +184,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v14.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, 
v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v15.8h, v1.h[1]\n\t" "mul v30.8h, v16.8h, v1.h[1]\n\t" "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v17.8h, v1.h[1]\n\t" "mul v30.8h, v18.8h, v1.h[1]\n\t" "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[1]\n\t" "mul v30.8h, v20.8h, v1.h[1]\n\t" "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v13.8h, v5.8h, v21.8h\n\t" @@ -790,60 +240,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v10.8h, v1.h[2]\n\t" "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v11.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[2]\n\t" "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v17.8h, v1.h[3]\n\t" "mul v30.8h, v18.8h, v1.h[3]\n\t" "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[3]\n\t" "mul v30.8h, v20.8h, v1.h[3]\n\t" "sqrdmulh v27.8h, 
v19.8h, v0.h[3]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v9.8h, v5.8h, v21.8h\n\t" @@ -866,60 +296,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[4]\n\t" "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v11.8h, v1.h[5]\n\t" "mul v30.8h, v12.8h, v1.h[5]\n\t" "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v15.8h, v1.h[6]\n\t" "mul v30.8h, v16.8h, v1.h[6]\n\t" "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[7]\n\t" "mul v30.8h, v20.8h, v1.h[7]\n\t" "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v7.8h, v5.8h, v21.8h\n\t" @@ -944,60 +354,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v10.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[3]\n\t" "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* 
!WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v14.8h, v1.h[4]\n\t" "mul v30.8h, v16.8h, v1.h[5]\n\t" "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v18.8h, v1.h[6]\n\t" "mul v30.8h, v20.8h, v1.h[7]\n\t" "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1054,60 +444,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v14.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v15.8h, v1.h[1]\n\t" "mul v30.8h, v16.8h, v1.h[1]\n\t" "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v17.8h, v1.h[1]\n\t" "mul v30.8h, v18.8h, v1.h[1]\n\t" "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[1]\n\t" "mul v30.8h, v20.8h, v1.h[1]\n\t" "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v13.8h, v5.8h, v21.8h\n\t" @@ -1130,60 +500,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v10.8h, v1.h[2]\n\t" "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" 
-#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v11.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[2]\n\t" "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v17.8h, v1.h[3]\n\t" "mul v30.8h, v18.8h, v1.h[3]\n\t" "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[3]\n\t" "mul v30.8h, v20.8h, v1.h[3]\n\t" "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v9.8h, v5.8h, v21.8h\n\t" @@ -1206,60 +556,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[4]\n\t" "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v11.8h, v1.h[5]\n\t" "mul v30.8h, v12.8h, v1.h[5]\n\t" "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v15.8h, v1.h[6]\n\t" "mul v30.8h, v16.8h, v1.h[6]\n\t" "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v19.8h, v1.h[7]\n\t" "mul v30.8h, v20.8h, v1.h[7]\n\t" "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef 
WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v7.8h, v5.8h, v21.8h\n\t" @@ -1284,60 +614,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v10.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[3]\n\t" "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v14.8h, v1.h[4]\n\t" "mul v30.8h, v16.8h, v1.h[5]\n\t" "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v18.8h, v1.h[6]\n\t" "mul v30.8h, v20.8h, v1.h[7]\n\t" "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1386,60 +696,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v10.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[3]\n\t" "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" 
"mul v29.8h, v14.8h, v1.h[4]\n\t" "mul v30.8h, v16.8h, v1.h[5]\n\t" "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v18.8h, v1.h[6]\n\t" "mul v30.8h, v20.8h, v1.h[7]\n\t" "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1472,15 +762,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v3.8h\n\t" "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "ldr q0, [x2, #96]\n\t" @@ -1497,15 +782,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v12.8h, v3.8h\n\t" "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #128]\n\t" @@ -1522,15 +802,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v16.8h, v3.8h\n\t" "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "ldr q0, [x2, #160]\n\t" @@ -1547,15 +822,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v20.8h, v3.8h\n\t" "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1588,15 +858,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v3.8h\n\t" "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else 
"sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "ldr q0, [x2, #352]\n\t" @@ -1613,15 +878,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v12.8h, v3.8h\n\t" "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #384]\n\t" @@ -1638,15 +898,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v16.8h, v3.8h\n\t" "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "ldr q0, [x2, #416]\n\t" @@ -1663,15 +918,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v20.8h, v3.8h\n\t" "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1808,60 +1058,40 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v1.h[1]\n\t" "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v29.8h, v10.8h, v1.h[2]\n\t" "mul v30.8h, v12.8h, v1.h[3]\n\t" "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v29.8h, v14.8h, v1.h[4]\n\t" "mul v30.8h, v16.8h, v1.h[5]\n\t" "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "mul v29.8h, v18.8h, v1.h[6]\n\t" "mul v30.8h, v20.8h, 
v1.h[7]\n\t" "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -1894,15 +1124,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v3.8h\n\t" "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "ldr q0, [x2, #224]\n\t" @@ -1919,15 +1144,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v12.8h, v3.8h\n\t" "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #256]\n\t" @@ -1944,15 +1164,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v16.8h, v3.8h\n\t" "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "ldr q0, [x2, #288]\n\t" @@ -1969,15 +1184,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v20.8h, v3.8h\n\t" "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -2010,15 +1220,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v8.8h, v3.8h\n\t" "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v21.8h, v21.8h, v29.8h\n\t" "sub v22.8h, v22.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "ldr q0, [x2, #480]\n\t" @@ -2035,15 +1240,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v12.8h, v3.8h\n\t" "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh 
v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v23.8h, v23.8h, v29.8h\n\t" "sub v24.8h, v24.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #512]\n\t" @@ -2060,15 +1260,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v16.8h, v3.8h\n\t" "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v25.8h, v25.8h, v29.8h\n\t" "sub v26.8h, v26.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v25.8h, v25.8h, #1\n\t" "sshr v26.8h, v26.8h, #1\n\t" "ldr q0, [x2, #544]\n\t" @@ -2085,15 +1280,10 @@ void kyber_ntt(sword16* r) "mul v30.8h, v20.8h, v3.8h\n\t" "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" - "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" -#else "sqrdmulh v29.8h, v29.8h, v4.h[0]\n\t" "sqrdmulh v30.8h, v30.8h, v4.h[0]\n\t" "sub v27.8h, v27.8h, v29.8h\n\t" "sub v28.8h, v28.8h, v30.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v27.8h, v27.8h, #1\n\t" "sshr v28.8h, v28.8h, #1\n\t" "sub v6.8h, v5.8h, v21.8h\n\t" @@ -2223,585 +1413,81 @@ void kyber_ntt(sword16* r) } static const word16 L_kyber_aarch64_zetas_inv[] = { - 0x6a5, - 0x6a5, - 0x70f, - 0x70f, - 0x5b4, - 0x5b4, - 0x943, - 0x943, - 0x922, - 0x922, - 0x91d, - 0x91d, - 0x134, - 0x134, - 0x6c, - 0x6c, - 0xb23, - 0xb23, - 0x366, - 0x366, - 0x356, - 0x356, - 0x5e6, - 0x5e6, - 0x9e7, - 0x9e7, - 0x4fe, - 0x4fe, - 0x5fa, - 0x5fa, - 0x4a1, - 0x4a1, - 0x67b, - 0x67b, - 0x4a3, - 0x4a3, - 0xc25, - 0xc25, - 0x36a, - 0x36a, - 0x537, - 0x537, - 0x83f, - 0x83f, - 0x88, - 0x88, - 0x4bf, - 0x4bf, - 0xb81, - 0xb81, - 0x5b9, - 0x5b9, - 0x505, - 0x505, - 0x7d7, - 0x7d7, - 0xa9f, - 0xa9f, - 0xaa6, - 0xaa6, - 0x8b8, - 0x8b8, - 0x9d0, - 0x9d0, - 0x4b, - 0x4b, - 0x9c, - 0x9c, - 0xbb8, - 0xbb8, - 0xb5f, - 0xb5f, - 0xba4, - 0xba4, - 0x368, - 0x368, - 0xa7d, - 0xa7d, - 0x636, - 0x636, - 0x8a2, - 0x8a2, - 0x25a, - 0x25a, - 0x736, - 0x736, - 0x309, - 0x309, - 0x93, - 0x93, - 0x87a, - 0x87a, - 0x9f7, - 0x9f7, - 0xf6, - 0xf6, - 0x68c, - 0x68c, - 0x6db, - 0x6db, - 0x1cc, - 0x1cc, - 0x123, - 0x123, - 0xeb, - 0xeb, - 0xc50, - 0xc50, - 0xab6, - 0xab6, - 0xb5b, - 0xb5b, - 0xc98, - 0xc98, - 0x6f3, - 0x6f3, - 0x99a, - 0x99a, - 0x4e3, - 0x4e3, - 0x9b6, - 0x9b6, - 0xad6, - 0xad6, - 0xb53, - 0xb53, - 0x44f, - 0x44f, - 0x4fb, - 0x4fb, - 0x4fb, - 0x4fb, - 0xa5c, - 0xa5c, - 0xa5c, - 0xa5c, - 0x429, - 0x429, - 0x429, - 0x429, - 0xb41, - 0xb41, - 0xb41, - 0xb41, - 0x2d5, - 0x2d5, - 0x2d5, - 0x2d5, - 0x5e4, - 0x5e4, - 0x5e4, - 0x5e4, - 0x940, - 0x940, - 0x940, - 0x940, - 0x18e, - 0x18e, - 0x18e, - 0x18e, - 0x3b7, - 0x3b7, - 0x3b7, - 0x3b7, - 0xf7, - 0xf7, - 0xf7, - 0xf7, - 0x58d, - 0x58d, - 0x58d, - 0x58d, - 0xc96, - 0xc96, - 0xc96, - 0xc96, - 0x9c3, - 0x9c3, - 0x9c3, - 0x9c3, - 0x10f, - 0x10f, - 0x10f, - 0x10f, - 0x5a, - 0x5a, - 0x5a, - 0x5a, - 0x355, - 0x355, - 0x355, - 0x355, - 0x744, - 0x744, - 0x744, - 0x744, - 0xc83, - 0xc83, - 0xc83, - 0xc83, - 0x48a, - 0x48a, - 0x48a, - 0x48a, - 0x652, - 0x652, - 0x652, - 0x652, - 0x29a, - 0x29a, - 0x29a, - 0x29a, - 0x140, - 0x140, - 0x140, - 0x140, - 0x8, - 0x8, - 0x8, - 0x8, - 0xafd, - 0xafd, - 0xafd, - 0xafd, - 0x608, - 0x608, - 0x608, - 0x608, - 0x11a, - 0x11a, - 0x11a, - 0x11a, - 0x72e, - 0x72e, 
- 0x72e, - 0x72e, - 0x50d, - 0x50d, - 0x50d, - 0x50d, - 0x90a, - 0x90a, - 0x90a, - 0x90a, - 0x228, - 0x228, - 0x228, - 0x228, - 0xa75, - 0xa75, - 0xa75, - 0xa75, - 0x83a, - 0x83a, - 0x83a, - 0x83a, - 0x623, - 0xcd, - 0xb66, - 0x606, - 0xaa1, - 0xa25, - 0x908, - 0x2a9, - 0x82, - 0x642, - 0x74f, - 0x33d, - 0xb82, - 0xbf9, - 0x52d, - 0xac4, - 0x745, - 0x5c2, - 0x4b2, - 0x93f, - 0xc4b, - 0x6d8, - 0xa93, - 0xab, - 0xc37, - 0xbe2, - 0x773, - 0x72c, - 0x5ed, - 0x167, - 0x2f6, - 0x5a1, + 0x06a5, 0x06a5, 0x070f, 0x070f, 0x05b4, 0x05b4, 0x0943, 0x0943, + 0x0922, 0x0922, 0x091d, 0x091d, 0x0134, 0x0134, 0x006c, 0x006c, + 0x0b23, 0x0b23, 0x0366, 0x0366, 0x0356, 0x0356, 0x05e6, 0x05e6, + 0x09e7, 0x09e7, 0x04fe, 0x04fe, 0x05fa, 0x05fa, 0x04a1, 0x04a1, + 0x067b, 0x067b, 0x04a3, 0x04a3, 0x0c25, 0x0c25, 0x036a, 0x036a, + 0x0537, 0x0537, 0x083f, 0x083f, 0x0088, 0x0088, 0x04bf, 0x04bf, + 0x0b81, 0x0b81, 0x05b9, 0x05b9, 0x0505, 0x0505, 0x07d7, 0x07d7, + 0x0a9f, 0x0a9f, 0x0aa6, 0x0aa6, 0x08b8, 0x08b8, 0x09d0, 0x09d0, + 0x004b, 0x004b, 0x009c, 0x009c, 0x0bb8, 0x0bb8, 0x0b5f, 0x0b5f, + 0x0ba4, 0x0ba4, 0x0368, 0x0368, 0x0a7d, 0x0a7d, 0x0636, 0x0636, + 0x08a2, 0x08a2, 0x025a, 0x025a, 0x0736, 0x0736, 0x0309, 0x0309, + 0x0093, 0x0093, 0x087a, 0x087a, 0x09f7, 0x09f7, 0x00f6, 0x00f6, + 0x068c, 0x068c, 0x06db, 0x06db, 0x01cc, 0x01cc, 0x0123, 0x0123, + 0x00eb, 0x00eb, 0x0c50, 0x0c50, 0x0ab6, 0x0ab6, 0x0b5b, 0x0b5b, + 0x0c98, 0x0c98, 0x06f3, 0x06f3, 0x099a, 0x099a, 0x04e3, 0x04e3, + 0x09b6, 0x09b6, 0x0ad6, 0x0ad6, 0x0b53, 0x0b53, 0x044f, 0x044f, + 0x04fb, 0x04fb, 0x04fb, 0x04fb, 0x0a5c, 0x0a5c, 0x0a5c, 0x0a5c, + 0x0429, 0x0429, 0x0429, 0x0429, 0x0b41, 0x0b41, 0x0b41, 0x0b41, + 0x02d5, 0x02d5, 0x02d5, 0x02d5, 0x05e4, 0x05e4, 0x05e4, 0x05e4, + 0x0940, 0x0940, 0x0940, 0x0940, 0x018e, 0x018e, 0x018e, 0x018e, + 0x03b7, 0x03b7, 0x03b7, 0x03b7, 0x00f7, 0x00f7, 0x00f7, 0x00f7, + 0x058d, 0x058d, 0x058d, 0x058d, 0x0c96, 0x0c96, 0x0c96, 0x0c96, + 0x09c3, 0x09c3, 0x09c3, 0x09c3, 0x010f, 0x010f, 0x010f, 0x010f, + 0x005a, 0x005a, 0x005a, 0x005a, 0x0355, 0x0355, 0x0355, 0x0355, + 0x0744, 0x0744, 0x0744, 0x0744, 0x0c83, 0x0c83, 0x0c83, 0x0c83, + 0x048a, 0x048a, 0x048a, 0x048a, 0x0652, 0x0652, 0x0652, 0x0652, + 0x029a, 0x029a, 0x029a, 0x029a, 0x0140, 0x0140, 0x0140, 0x0140, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0afd, 0x0afd, 0x0afd, 0x0afd, + 0x0608, 0x0608, 0x0608, 0x0608, 0x011a, 0x011a, 0x011a, 0x011a, + 0x072e, 0x072e, 0x072e, 0x072e, 0x050d, 0x050d, 0x050d, 0x050d, + 0x090a, 0x090a, 0x090a, 0x090a, 0x0228, 0x0228, 0x0228, 0x0228, + 0x0a75, 0x0a75, 0x0a75, 0x0a75, 0x083a, 0x083a, 0x083a, 0x083a, + 0x0623, 0x00cd, 0x0b66, 0x0606, 0x0aa1, 0x0a25, 0x0908, 0x02a9, + 0x0082, 0x0642, 0x074f, 0x033d, 0x0b82, 0x0bf9, 0x052d, 0x0ac4, + 0x0745, 0x05c2, 0x04b2, 0x093f, 0x0c4b, 0x06d8, 0x0a93, 0x00ab, + 0x0c37, 0x0be2, 0x0773, 0x072c, 0x05ed, 0x0167, 0x02f6, 0x05a1, }; static const word16 L_kyber_aarch64_zetas_inv_qinv[] = { - 0xa5a5, - 0xa5a5, - 0x440f, - 0x440f, - 0xe1b4, - 0xe1b4, - 0xa243, - 0xa243, - 0x4f22, - 0x4f22, - 0x901d, - 0x901d, - 0x5d34, - 0x5d34, - 0x846c, - 0x846c, - 0x4423, - 0x4423, - 0xd566, - 0xd566, - 0xa556, - 0xa556, - 0x57e6, - 0x57e6, - 0x4ee7, - 0x4ee7, - 0x1efe, - 0x1efe, - 0x53fa, - 0x53fa, - 0xd7a1, - 0xd7a1, - 0xc77b, - 0xc77b, - 0xbda3, - 0xbda3, - 0x2b25, - 0x2b25, - 0xa16a, - 0xa16a, - 0x3a37, - 0x3a37, - 0xd53f, - 0xd53f, - 0x1888, - 0x1888, - 0x51bf, - 0x51bf, - 0x7e81, - 0x7e81, - 0xa0b9, - 0xa0b9, - 0xc405, - 0xc405, - 0x1cd7, - 0x1cd7, - 0xf79f, - 0xf79f, - 0x9ca6, - 0x9ca6, - 0xb0b8, - 0xb0b8, - 0x79d0, - 
0x79d0, - 0x314b, - 0x314b, - 0x149c, - 0x149c, - 0xb3b8, - 0xb3b8, - 0x385f, - 0x385f, - 0xb7a4, - 0xb7a4, - 0xbb68, - 0xbb68, - 0xb17d, - 0xb17d, - 0x4836, - 0x4836, - 0xcea2, - 0xcea2, - 0x705a, - 0x705a, - 0x4936, - 0x4936, - 0x8e09, - 0x8e09, - 0x8993, - 0x8993, - 0xd67a, - 0xd67a, - 0x7ef7, - 0x7ef7, - 0x82f6, - 0x82f6, - 0xea8c, - 0xea8c, - 0xe7db, - 0xe7db, - 0xa5cc, - 0xa5cc, - 0x3a23, - 0x3a23, - 0x11eb, - 0x11eb, - 0xfc50, - 0xfc50, - 0xccb6, - 0xccb6, - 0x6c5b, - 0x6c5b, - 0x5498, - 0x5498, - 0xaff3, - 0xaff3, - 0x379a, - 0x379a, - 0x7de3, - 0x7de3, - 0xcbb6, - 0xcbb6, - 0x2cd6, - 0x2cd6, - 0xd453, - 0xd453, - 0x14f, - 0x14f, - 0x45fb, - 0x45fb, - 0x45fb, - 0x45fb, - 0x5e5c, - 0x5e5c, - 0x5e5c, - 0x5e5c, - 0xef29, - 0xef29, - 0xef29, - 0xef29, - 0xbe41, - 0xbe41, - 0xbe41, - 0xbe41, - 0x31d5, - 0x31d5, - 0x31d5, - 0x31d5, - 0x71e4, - 0x71e4, - 0x71e4, - 0x71e4, - 0xc940, - 0xc940, - 0xc940, - 0xc940, - 0xcb8e, - 0xcb8e, - 0xcb8e, - 0xcb8e, - 0xb8b7, - 0xb8b7, - 0xb8b7, - 0xb8b7, - 0x75f7, - 0x75f7, - 0x75f7, - 0x75f7, - 0xdc8d, - 0xdc8d, - 0xdc8d, - 0xdc8d, - 0x6e96, - 0x6e96, - 0x6e96, - 0x6e96, - 0x22c3, - 0x22c3, - 0x22c3, - 0x22c3, - 0x3e0f, - 0x3e0f, - 0x3e0f, - 0x3e0f, - 0x6e5a, - 0x6e5a, - 0x6e5a, - 0x6e5a, - 0xb255, - 0xb255, - 0xb255, - 0xb255, - 0x9344, - 0x9344, - 0x9344, - 0x9344, - 0x6583, - 0x6583, - 0x6583, - 0x6583, - 0x28a, - 0x28a, - 0x28a, - 0x28a, - 0xdc52, - 0xdc52, - 0xdc52, - 0xdc52, - 0x309a, - 0x309a, - 0x309a, - 0x309a, - 0xc140, - 0xc140, - 0xc140, - 0xc140, - 0x9808, - 0x9808, - 0x9808, - 0x9808, - 0x31fd, - 0x31fd, - 0x31fd, - 0x31fd, - 0x9e08, - 0x9e08, - 0x9e08, - 0x9e08, - 0xaf1a, - 0xaf1a, - 0xaf1a, - 0xaf1a, - 0xb12e, - 0xb12e, - 0xb12e, - 0xb12e, - 0x5c0d, - 0x5c0d, - 0x5c0d, - 0x5c0d, - 0x870a, - 0x870a, - 0x870a, - 0x870a, - 0xfa28, - 0xfa28, - 0xfa28, - 0xfa28, - 0x1975, - 0x1975, - 0x1975, - 0x1975, - 0x163a, - 0x163a, - 0x163a, - 0x163a, - 0x3f23, - 0x97cd, - 0xdd66, - 0xb806, - 0xdda1, - 0x2925, - 0xa108, - 0x6da9, - 0x6682, - 0xac42, - 0x44f, - 0xea3d, - 0x7182, - 0x66f9, - 0xbc2d, - 0x16c4, - 0x8645, - 0x2bc2, - 0xfab2, - 0xd63f, - 0x3d4b, - 0xed8, - 0x9393, - 0x51ab, - 0x4137, - 0x91e2, - 0x3073, - 0xcb2c, - 0xfced, - 0xc667, - 0x84f6, - 0xd8a1, + 0xa5a5, 0xa5a5, 0x440f, 0x440f, 0xe1b4, 0xe1b4, 0xa243, 0xa243, + 0x4f22, 0x4f22, 0x901d, 0x901d, 0x5d34, 0x5d34, 0x846c, 0x846c, + 0x4423, 0x4423, 0xd566, 0xd566, 0xa556, 0xa556, 0x57e6, 0x57e6, + 0x4ee7, 0x4ee7, 0x1efe, 0x1efe, 0x53fa, 0x53fa, 0xd7a1, 0xd7a1, + 0xc77b, 0xc77b, 0xbda3, 0xbda3, 0x2b25, 0x2b25, 0xa16a, 0xa16a, + 0x3a37, 0x3a37, 0xd53f, 0xd53f, 0x1888, 0x1888, 0x51bf, 0x51bf, + 0x7e81, 0x7e81, 0xa0b9, 0xa0b9, 0xc405, 0xc405, 0x1cd7, 0x1cd7, + 0xf79f, 0xf79f, 0x9ca6, 0x9ca6, 0xb0b8, 0xb0b8, 0x79d0, 0x79d0, + 0x314b, 0x314b, 0x149c, 0x149c, 0xb3b8, 0xb3b8, 0x385f, 0x385f, + 0xb7a4, 0xb7a4, 0xbb68, 0xbb68, 0xb17d, 0xb17d, 0x4836, 0x4836, + 0xcea2, 0xcea2, 0x705a, 0x705a, 0x4936, 0x4936, 0x8e09, 0x8e09, + 0x8993, 0x8993, 0xd67a, 0xd67a, 0x7ef7, 0x7ef7, 0x82f6, 0x82f6, + 0xea8c, 0xea8c, 0xe7db, 0xe7db, 0xa5cc, 0xa5cc, 0x3a23, 0x3a23, + 0x11eb, 0x11eb, 0xfc50, 0xfc50, 0xccb6, 0xccb6, 0x6c5b, 0x6c5b, + 0x5498, 0x5498, 0xaff3, 0xaff3, 0x379a, 0x379a, 0x7de3, 0x7de3, + 0xcbb6, 0xcbb6, 0x2cd6, 0x2cd6, 0xd453, 0xd453, 0x014f, 0x014f, + 0x45fb, 0x45fb, 0x45fb, 0x45fb, 0x5e5c, 0x5e5c, 0x5e5c, 0x5e5c, + 0xef29, 0xef29, 0xef29, 0xef29, 0xbe41, 0xbe41, 0xbe41, 0xbe41, + 0x31d5, 0x31d5, 0x31d5, 0x31d5, 0x71e4, 0x71e4, 0x71e4, 0x71e4, + 0xc940, 0xc940, 0xc940, 0xc940, 0xcb8e, 0xcb8e, 0xcb8e, 
0xcb8e, + 0xb8b7, 0xb8b7, 0xb8b7, 0xb8b7, 0x75f7, 0x75f7, 0x75f7, 0x75f7, + 0xdc8d, 0xdc8d, 0xdc8d, 0xdc8d, 0x6e96, 0x6e96, 0x6e96, 0x6e96, + 0x22c3, 0x22c3, 0x22c3, 0x22c3, 0x3e0f, 0x3e0f, 0x3e0f, 0x3e0f, + 0x6e5a, 0x6e5a, 0x6e5a, 0x6e5a, 0xb255, 0xb255, 0xb255, 0xb255, + 0x9344, 0x9344, 0x9344, 0x9344, 0x6583, 0x6583, 0x6583, 0x6583, + 0x028a, 0x028a, 0x028a, 0x028a, 0xdc52, 0xdc52, 0xdc52, 0xdc52, + 0x309a, 0x309a, 0x309a, 0x309a, 0xc140, 0xc140, 0xc140, 0xc140, + 0x9808, 0x9808, 0x9808, 0x9808, 0x31fd, 0x31fd, 0x31fd, 0x31fd, + 0x9e08, 0x9e08, 0x9e08, 0x9e08, 0xaf1a, 0xaf1a, 0xaf1a, 0xaf1a, + 0xb12e, 0xb12e, 0xb12e, 0xb12e, 0x5c0d, 0x5c0d, 0x5c0d, 0x5c0d, + 0x870a, 0x870a, 0x870a, 0x870a, 0xfa28, 0xfa28, 0xfa28, 0xfa28, + 0x1975, 0x1975, 0x1975, 0x1975, 0x163a, 0x163a, 0x163a, 0x163a, + 0x3f23, 0x97cd, 0xdd66, 0xb806, 0xdda1, 0x2925, 0xa108, 0x6da9, + 0x6682, 0xac42, 0x044f, 0xea3d, 0x7182, 0x66f9, 0xbc2d, 0x16c4, + 0x8645, 0x2bc2, 0xfab2, 0xd63f, 0x3d4b, 0x0ed8, 0x9393, 0x51ab, + 0x4137, 0x91e2, 0x3073, 0xcb2c, 0xfced, 0xc667, 0x84f6, 0xd8a1, }; void kyber_invntt(sword16* r) @@ -2898,15 +1584,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "ldr q0, [x2, #32]\n\t" @@ -2921,15 +1602,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "ldr q0, [x2, #64]\n\t" @@ -2944,15 +1620,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "ldr q0, [x2, #96]\n\t" @@ -2967,15 +1638,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #256]\n\t" @@ -2996,15 +1662,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" 
- "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "ldr q0, [x2, #288]\n\t" @@ -3025,15 +1686,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "ldr q0, [x2, #320]\n\t" @@ -3054,15 +1710,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "ldr q0, [x2, #352]\n\t" @@ -3083,15 +1734,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #512]\n\t" @@ -3110,15 +1756,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[1]\n\t" "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mov v25.16b, v13.16b\n\t" @@ -3135,15 +1776,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[3]\n\t" "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "mov v25.16b, v17.16b\n\t" @@ -3160,15 +1796,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[5]\n\t" "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" 
-#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "mov v25.16b, v21.16b\n\t" @@ -3185,15 +1816,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[7]\n\t" "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" @@ -3296,15 +1922,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "ldr q0, [x2, #160]\n\t" @@ -3319,15 +1940,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "ldr q0, [x2, #192]\n\t" @@ -3342,15 +1958,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "ldr q0, [x2, #224]\n\t" @@ -3365,15 +1976,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #384]\n\t" @@ -3394,15 +2000,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "ldr q0, [x2, #416]\n\t" @@ -3423,15 +2024,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, 
v28.8h, v3.8h\n\t" "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "ldr q0, [x2, #448]\n\t" @@ -3452,15 +2048,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "ldr q0, [x2, #480]\n\t" @@ -3481,15 +2072,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v3.8h\n\t" "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "ldr q0, [x2, #528]\n\t" @@ -3508,15 +2094,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[1]\n\t" "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mov v25.16b, v13.16b\n\t" @@ -3533,15 +2114,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[3]\n\t" "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "mov v25.16b, v17.16b\n\t" @@ -3558,15 +2134,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[5]\n\t" "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "mov v25.16b, v21.16b\n\t" @@ -3583,15 +2154,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v2.h[7]\n\t" "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh 
v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" @@ -3654,15 +2220,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[1]\n\t" "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "sub v26.8h, v13.8h, v14.8h\n\t" @@ -3673,15 +2234,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[3]\n\t" "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v18.8h\n\t" @@ -3692,15 +2248,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[5]\n\t" "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v21.8h, v22.8h\n\t" @@ -3711,15 +2262,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[7]\n\t" "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sub v26.8h, v9.8h, v11.8h\n\t" @@ -3730,15 +2276,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[0]\n\t" "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v11.8h, v11.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "sub v26.8h, v13.8h, v15.8h\n\t" @@ -3749,15 +2290,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[1]\n\t" "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v15.8h, 
v15.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v19.8h\n\t" @@ -3768,15 +2304,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[2]\n\t" "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v19.8h, v19.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v21.8h, v23.8h\n\t" @@ -3787,15 +2318,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[3]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sub v26.8h, v9.8h, v13.8h\n\t" @@ -3806,15 +2332,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[4]\n\t" "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v13.8h, v13.8h, v25.8h\n\t" "sub v14.8h, v14.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "sub v26.8h, v11.8h, v15.8h\n\t" @@ -3825,15 +2346,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[4]\n\t" "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v15.8h, v15.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v21.8h\n\t" @@ -3844,15 +2360,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[5]\n\t" "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v21.8h, v21.8h, v25.8h\n\t" "sub v22.8h, v22.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "sub v26.8h, v19.8h, v23.8h\n\t" @@ -3863,15 +2374,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[5]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr 
v24.8h, v24.8h, #1\n\t" "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" @@ -3906,15 +2412,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v17.8h, v17.8h, v25.8h\n\t" "sub v18.8h, v18.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v17.8h, v17.8h, #1\n\t" "sshr v18.8h, v18.8h, #1\n\t" "sub v26.8h, v11.8h, v19.8h\n\t" @@ -3925,15 +2426,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v19.8h, v19.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v13.8h, v21.8h\n\t" @@ -3944,15 +2440,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v21.8h, v21.8h, v25.8h\n\t" "sub v22.8h, v22.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "sub v26.8h, v15.8h, v23.8h\n\t" @@ -3963,135 +2454,90 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v25.8h, v9.8h, v7.h[7]\n\t" "mul v26.8h, v10.8h, v7.h[7]\n\t" "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v9.8h, v9.8h, v25.8h\n\t" "sub v10.8h, v10.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v9.8h, v9.8h, #1\n\t" "sshr v10.8h, v10.8h, #1\n\t" "mul v25.8h, v11.8h, v7.h[7]\n\t" "mul v26.8h, v12.8h, v7.h[7]\n\t" "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v11.8h, v11.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mul v25.8h, v13.8h, v7.h[7]\n\t" "mul v26.8h, v14.8h, v7.h[7]\n\t" "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v13.8h, v25.8h, 
v8.h[0]\n\t" - "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v13.8h, v13.8h, v25.8h\n\t" "sub v14.8h, v14.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "mul v25.8h, v15.8h, v7.h[7]\n\t" "mul v26.8h, v16.8h, v7.h[7]\n\t" "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v15.8h, v15.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "mul v25.8h, v17.8h, v7.h[7]\n\t" "mul v26.8h, v18.8h, v7.h[7]\n\t" "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v17.8h, v17.8h, v25.8h\n\t" "sub v18.8h, v18.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v17.8h, v17.8h, #1\n\t" "sshr v18.8h, v18.8h, #1\n\t" "mul v25.8h, v19.8h, v7.h[7]\n\t" "mul v26.8h, v20.8h, v7.h[7]\n\t" "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v19.8h, v19.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "mul v25.8h, v21.8h, v7.h[7]\n\t" "mul v26.8h, v22.8h, v7.h[7]\n\t" "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v21.8h, v21.8h, v25.8h\n\t" "sub v22.8h, v22.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v25.8h, v23.8h, v7.h[7]\n\t" "mul v26.8h, v24.8h, v7.h[7]\n\t" "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "str q9, [%x[r]]\n\t" @@ -4134,15 +2580,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[1]\n\t" "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v10.8h, v10.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v10.8h, v10.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "sub v26.8h, v13.8h, v14.8h\n\t" @@ -4153,15 +2594,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, 
v28.8h, v6.h[3]\n\t" "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v14.8h, v14.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v14.8h, v14.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v18.8h\n\t" @@ -4172,15 +2608,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[5]\n\t" "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v18.8h, v18.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v18.8h, v18.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v21.8h, v22.8h\n\t" @@ -4191,15 +2622,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v6.h[7]\n\t" "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v22.8h, v22.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v22.8h, v22.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sub v26.8h, v9.8h, v11.8h\n\t" @@ -4210,15 +2636,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[0]\n\t" "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v11.8h, v11.8h, v25.8h\n\t" "sub v12.8h, v12.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "sub v26.8h, v13.8h, v15.8h\n\t" @@ -4229,15 +2650,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[1]\n\t" "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v15.8h, v15.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v19.8h\n\t" @@ -4248,15 +2664,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[2]\n\t" "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v19.8h, v19.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v21.8h, v23.8h\n\t" @@ -4267,15 +2678,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[3]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH 
- "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sub v26.8h, v9.8h, v13.8h\n\t" @@ -4286,15 +2692,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[4]\n\t" "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v13.8h, v13.8h, v25.8h\n\t" "sub v14.8h, v14.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "sub v26.8h, v11.8h, v15.8h\n\t" @@ -4305,15 +2706,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[4]\n\t" "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v15.8h, v15.8h, v25.8h\n\t" "sub v16.8h, v16.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "sub v26.8h, v17.8h, v21.8h\n\t" @@ -4324,15 +2720,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[5]\n\t" "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v21.8h, v21.8h, v25.8h\n\t" "sub v22.8h, v22.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "sub v26.8h, v19.8h, v23.8h\n\t" @@ -4343,15 +2734,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[5]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v23.8h, v23.8h, v25.8h\n\t" "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" @@ -4386,15 +2772,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v17.8h, v17.8h, v25.8h\n\t" "sub v18.8h, v18.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v17.8h, v17.8h, #1\n\t" "sshr v18.8h, v18.8h, #1\n\t" "sub v26.8h, v11.8h, v19.8h\n\t" @@ -4405,15 +2786,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" 
"sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v19.8h, v19.8h, v25.8h\n\t" "sub v20.8h, v20.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "sub v26.8h, v13.8h, v21.8h\n\t" @@ -4424,15 +2800,10 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" - "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" -#else "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" "sub v21.8h, v21.8h, v25.8h\n\t" "sub v22.8h, v22.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "sub v26.8h, v15.8h, v23.8h\n\t" @@ -4443,135 +2814,2543 @@ void kyber_invntt(sword16* r) "mul v27.8h, v28.8h, v7.h[6]\n\t" "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v27.8h\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v9.8h, v9.8h, v25.8h\n\t" + "sub v10.8h, v10.8h, v26.8h\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v11.8h, v11.8h, v25.8h\n\t" + "sub v12.8h, v12.8h, v26.8h\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v13.8h, v13.8h, v25.8h\n\t" + "sub v14.8h, v14.8h, v26.8h\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v15.8h, v15.8h, v25.8h\n\t" + "sub v16.8h, v16.8h, v26.8h\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v17.8h, v17.8h, v25.8h\n\t" + "sub v18.8h, v18.8h, v26.8h\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v19.8h, v19.8h, v25.8h\n\t" + "sub v20.8h, v20.8h, v26.8h\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, 
v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v21.8h, v21.8h, v25.8h\n\t" + "sub v22.8h, v22.8h, v26.8h\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" + "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" + "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" + "sub v23.8h, v23.8h, v25.8h\n\t" + "sub v24.8h, v24.8h, v26.8h\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r], #16]\n\t" + "str q10, [%x[r], #48]\n\t" + "str q11, [%x[r], #80]\n\t" + "str q12, [%x[r], #112]\n\t" + "str q13, [%x[r], #144]\n\t" + "str q14, [%x[r], #176]\n\t" + "str q15, [%x[r], #208]\n\t" + "str q16, [%x[r], #240]\n\t" + "str q17, [x1, #16]\n\t" + "str q18, [x1, #48]\n\t" + "str q19, [x1, #80]\n\t" + "str q20, [x1, #112]\n\t" + "str q21, [x1, #144]\n\t" + "str q22, [x1, #176]\n\t" + "str q23, [x1, #208]\n\t" + "str q24, [x1, #240]\n\t" + : [r] "+r" (r) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv) + : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "cc" + ); +} + +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH +void kyber_ntt_sqrdmlsh(sword16* r) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_zetas]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_zetas]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_zetas]@PAGE\n\t" + "add x2, x2, %[L_kyber_aarch64_zetas]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_qinv]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_qinv]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_qinv]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_qinv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "add x1, %x[r], #0x100\n\t" + "ldr q4, [x4]\n\t" + "ldr q5, [%x[r]]\n\t" + "ldr q6, [%x[r], #32]\n\t" + "ldr q7, [%x[r], #64]\n\t" + "ldr q8, [%x[r], #96]\n\t" + "ldr q9, [%x[r], #128]\n\t" + "ldr q10, [%x[r], #160]\n\t" + "ldr q11, [%x[r], #192]\n\t" + "ldr q12, [%x[r], #224]\n\t" + "ldr q13, [x1]\n\t" + "ldr q14, [x1, #32]\n\t" + "ldr q15, [x1, #64]\n\t" + "ldr q16, [x1, #96]\n\t" + "ldr q17, [x1, #128]\n\t" + "ldr q18, [x1, #160]\n\t" + "ldr q19, [x1, #192]\n\t" + "ldr q20, [x1, #224]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, 
v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" + 
"sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, #16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r]]\n\t" + "str q6, [%x[r], #32]\n\t" + "str q7, [%x[r], #64]\n\t" + "str q8, [%x[r], #96]\n\t" + "str q9, [%x[r], #128]\n\t" + "str q10, [%x[r], #160]\n\t" + "str q11, [%x[r], #192]\n\t" + "str q12, [%x[r], #224]\n\t" + "str q13, [x1]\n\t" + "str q14, [x1, #32]\n\t" + "str q15, [x1, #64]\n\t" + "str q16, [x1, #96]\n\t" + "str q17, [x1, #128]\n\t" + "str q18, [x1, #160]\n\t" + "str q19, [x1, #192]\n\t" + "str q20, [x1, #224]\n\t" + "ldr q5, 
[%x[r], #16]\n\t" + "ldr q6, [%x[r], #48]\n\t" + "ldr q7, [%x[r], #80]\n\t" + "ldr q8, [%x[r], #112]\n\t" + "ldr q9, [%x[r], #144]\n\t" + "ldr q10, [%x[r], #176]\n\t" + "ldr q11, [%x[r], #208]\n\t" + "ldr q12, [%x[r], #240]\n\t" + "ldr q13, [x1, #16]\n\t" + "ldr q14, [x1, #48]\n\t" + "ldr q15, [x1, #80]\n\t" + "ldr q16, [x1, #112]\n\t" + "ldr q17, [x1, #144]\n\t" + "ldr q18, [x1, #176]\n\t" + "ldr q19, [x1, #208]\n\t" + "ldr q20, [x1, #240]\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x3]\n\t" + "mul v29.8h, v13.8h, v1.h[1]\n\t" + "mul v30.8h, v14.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v13.8h, v0.h[1]\n\t" + "sqrdmulh v22.8h, v14.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[1]\n\t" + "mul v30.8h, v16.8h, v1.h[1]\n\t" + "sqrdmulh v23.8h, v15.8h, v0.h[1]\n\t" + "sqrdmulh v24.8h, v16.8h, v0.h[1]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[1]\n\t" + "mul v30.8h, v18.8h, v1.h[1]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[1]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[1]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[1]\n\t" + "mul v30.8h, v20.8h, v1.h[1]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[1]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[1]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v13.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v14.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v15.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v16.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v9.8h, v25.8h\n\t" + "add v9.8h, v9.8h, v25.8h\n\t" + "sub v18.8h, v10.8h, v26.8h\n\t" + "add v10.8h, v10.8h, v26.8h\n\t" + "sub v19.8h, v11.8h, v27.8h\n\t" + "add v11.8h, v11.8h, v27.8h\n\t" + "sub v20.8h, v12.8h, v28.8h\n\t" + "add v12.8h, v12.8h, v28.8h\n\t" + "mul v29.8h, v9.8h, v1.h[2]\n\t" + "mul v30.8h, v10.8h, v1.h[2]\n\t" + "sqrdmulh v21.8h, v9.8h, v0.h[2]\n\t" + "sqrdmulh v22.8h, v10.8h, v0.h[2]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[2]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[2]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v17.8h, v1.h[3]\n\t" + "mul v30.8h, v18.8h, v1.h[3]\n\t" + "sqrdmulh v25.8h, v17.8h, v0.h[3]\n\t" + "sqrdmulh v26.8h, v18.8h, v0.h[3]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[3]\n\t" + "mul v30.8h, v20.8h, v1.h[3]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[3]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[3]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v9.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v10.8h, v6.8h, v22.8h\n\t" + "add v6.8h, 
v6.8h, v22.8h\n\t" + "sub v11.8h, v7.8h, v23.8h\n\t" + "add v7.8h, v7.8h, v23.8h\n\t" + "sub v12.8h, v8.8h, v24.8h\n\t" + "add v8.8h, v8.8h, v24.8h\n\t" + "sub v17.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v18.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v15.8h, v27.8h\n\t" + "add v15.8h, v15.8h, v27.8h\n\t" + "sub v20.8h, v16.8h, v28.8h\n\t" + "add v16.8h, v16.8h, v28.8h\n\t" + "mul v29.8h, v7.8h, v1.h[4]\n\t" + "mul v30.8h, v8.8h, v1.h[4]\n\t" + "sqrdmulh v21.8h, v7.8h, v0.h[4]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[4]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v11.8h, v1.h[5]\n\t" + "mul v30.8h, v12.8h, v1.h[5]\n\t" + "sqrdmulh v23.8h, v11.8h, v0.h[5]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[5]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v15.8h, v1.h[6]\n\t" + "mul v30.8h, v16.8h, v1.h[6]\n\t" + "sqrdmulh v25.8h, v15.8h, v0.h[6]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[6]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v19.8h, v1.h[7]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v19.8h, v0.h[7]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v7.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v6.8h, v22.8h\n\t" + "add v6.8h, v6.8h, v22.8h\n\t" + "sub v11.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v10.8h, v24.8h\n\t" + "add v10.8h, v10.8h, v24.8h\n\t" + "sub v15.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v14.8h, v26.8h\n\t" + "add v14.8h, v14.8h, v26.8h\n\t" + "sub v19.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v18.8h, v28.8h\n\t" + "add v18.8h, v18.8h, v28.8h\n\t" + "ldr q0, [x2, #16]\n\t" + "ldr q1, [x3, #16]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, 
v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "str q5, [%x[r], #16]\n\t" + "str q6, [%x[r], #48]\n\t" + "str q7, [%x[r], #80]\n\t" + "str q8, [%x[r], #112]\n\t" + "str q9, [%x[r], #144]\n\t" + "str q10, [%x[r], #176]\n\t" + "str q11, [%x[r], #208]\n\t" + "str q12, [%x[r], #240]\n\t" + "str q13, [x1, #16]\n\t" + "str q14, [x1, #48]\n\t" + "str q15, [x1, #80]\n\t" + "str q16, [x1, #112]\n\t" + "str q17, [x1, #144]\n\t" + "str q18, [x1, #176]\n\t" + "str q19, [x1, #208]\n\t" + "str q20, [x1, #240]\n\t" + "ldp q5, q6, [%x[r]]\n\t" + "ldp q7, q8, [%x[r], #32]\n\t" + "ldp q9, q10, [%x[r], #64]\n\t" + "ldp q11, q12, [%x[r], #96]\n\t" + "ldp q13, q14, [%x[r], #128]\n\t" + "ldp q15, q16, [%x[r], #160]\n\t" + "ldp q17, q18, [%x[r], #192]\n\t" + "ldp q19, q20, [%x[r], #224]\n\t" + "ldr q0, [x2, #32]\n\t" + "ldr q1, [x3, #32]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q2, [x2, #80]\n\t" + "ldr q1, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, 
[x2, #96]\n\t" + "ldr q2, [x2, #112]\n\t" + "ldr q1, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q2, [x2, #144]\n\t" + "ldr q1, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q2, [x2, #176]\n\t" + "ldr q1, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q2, [x2, #336]\n\t" + "ldr q1, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q2, [x2, #368]\n\t" + "ldr q1, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + 
"ldr q2, [x2, #400]\n\t" + "ldr q1, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q2, [x2, #432]\n\t" + "ldr q1, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + 
"mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v29.4s, v12.4s\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v29.2d, v12.2d\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v29.4s, v16.4s\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v29.2d, v16.2d\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v29.4s, v20.4s\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v29.2d, v20.2d\n\t" + "stp q5, q6, [%x[r]]\n\t" + "stp q7, q8, [%x[r], #32]\n\t" + "stp q9, q10, [%x[r], #64]\n\t" + "stp q11, q12, [%x[r], #96]\n\t" + "stp q13, q14, [%x[r], #128]\n\t" + "stp q15, q16, [%x[r], #160]\n\t" + "stp q17, q18, [%x[r], #192]\n\t" + "stp q19, q20, [%x[r], #224]\n\t" + "ldp q5, q6, [x1]\n\t" + "ldp q7, q8, [x1, #32]\n\t" + "ldp q9, q10, [x1, #64]\n\t" + "ldp q11, q12, [x1, #96]\n\t" + "ldp q13, q14, [x1, #128]\n\t" + "ldp q15, q16, [x1, #160]\n\t" + "ldp q17, q18, [x1, #192]\n\t" + "ldp q19, q20, [x1, #224]\n\t" + "ldr q0, [x2, #48]\n\t" + "ldr q1, [x3, #48]\n\t" + "mul v29.8h, v6.8h, v1.h[0]\n\t" + "mul v30.8h, v8.8h, v1.h[1]\n\t" + "sqrdmulh v21.8h, v6.8h, v0.h[0]\n\t" + "sqrdmulh v22.8h, v8.8h, v0.h[1]\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v29.8h, v10.8h, v1.h[2]\n\t" + "mul v30.8h, v12.8h, v1.h[3]\n\t" + "sqrdmulh v23.8h, v10.8h, v0.h[2]\n\t" + "sqrdmulh v24.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v29.8h, v14.8h, v1.h[4]\n\t" + "mul v30.8h, v16.8h, v1.h[5]\n\t" + "sqrdmulh v25.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v26.8h, v16.8h, v0.h[5]\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "mul v29.8h, v18.8h, v1.h[6]\n\t" + "mul v30.8h, v20.8h, v1.h[7]\n\t" + "sqrdmulh v27.8h, v18.8h, v0.h[6]\n\t" + "sqrdmulh v28.8h, v20.8h, v0.h[7]\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, 
v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q2, [x2, #208]\n\t" + "ldr q1, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "trn2 v8.2d, v30.2d, v8.2d\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q2, [x2, #240]\n\t" + "ldr q1, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "trn2 v12.2d, v30.2d, v12.2d\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q2, [x2, #272]\n\t" + "ldr q1, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "trn2 v16.2d, v30.2d, v16.2d\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + "ldr q2, [x2, #304]\n\t" + "ldr q1, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "trn2 v20.2d, v30.2d, v20.2d\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q2, [x2, #464]\n\t" + "ldr q1, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v29.16b, v5.16b\n\t" + "mov v30.16b, v7.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "trn2 v8.4s, v30.4s, v8.4s\n\t" + "mul v29.8h, v6.8h, v1.8h\n\t" + "mul v30.8h, v8.8h, 
v3.8h\n\t" + "sqrdmulh v21.8h, v6.8h, v0.8h\n\t" + "sqrdmulh v22.8h, v8.8h, v2.8h\n\t" + "sqrdmlsh v21.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v22.8h, v30.8h, v4.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q2, [x2, #496]\n\t" + "ldr q1, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v29.16b, v9.16b\n\t" + "mov v30.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "trn2 v12.4s, v30.4s, v12.4s\n\t" + "mul v29.8h, v10.8h, v1.8h\n\t" + "mul v30.8h, v12.8h, v3.8h\n\t" + "sqrdmulh v23.8h, v10.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v12.8h, v2.8h\n\t" + "sqrdmlsh v23.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v24.8h, v30.8h, v4.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x2, #528]\n\t" + "ldr q1, [x3, #512]\n\t" + "ldr q3, [x3, #528]\n\t" + "mov v29.16b, v13.16b\n\t" + "mov v30.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "trn2 v16.4s, v30.4s, v16.4s\n\t" + "mul v29.8h, v14.8h, v1.8h\n\t" + "mul v30.8h, v16.8h, v3.8h\n\t" + "sqrdmulh v25.8h, v14.8h, v0.8h\n\t" + "sqrdmulh v26.8h, v16.8h, v2.8h\n\t" + "sqrdmlsh v25.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v26.8h, v30.8h, v4.h[0]\n\t" + "sshr v25.8h, v25.8h, #1\n\t" + "sshr v26.8h, v26.8h, #1\n\t" + "ldr q0, [x2, #544]\n\t" + "ldr q2, [x2, #560]\n\t" + "ldr q1, [x3, #544]\n\t" + "ldr q3, [x3, #560]\n\t" + "mov v29.16b, v17.16b\n\t" + "mov v30.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "trn2 v20.4s, v30.4s, v20.4s\n\t" + "mul v29.8h, v18.8h, v1.8h\n\t" + "mul v30.8h, v20.8h, v3.8h\n\t" + "sqrdmulh v27.8h, v18.8h, v0.8h\n\t" + "sqrdmulh v28.8h, v20.8h, v2.8h\n\t" + "sqrdmlsh v27.8h, v29.8h, v4.h[0]\n\t" + "sqrdmlsh v28.8h, v30.8h, v4.h[0]\n\t" + "sshr v27.8h, v27.8h, #1\n\t" + "sshr v28.8h, v28.8h, #1\n\t" + "sub v6.8h, v5.8h, v21.8h\n\t" + "add v5.8h, v5.8h, v21.8h\n\t" + "sub v8.8h, v7.8h, v22.8h\n\t" + "add v7.8h, v7.8h, v22.8h\n\t" + "sub v10.8h, v9.8h, v23.8h\n\t" + "add v9.8h, v9.8h, v23.8h\n\t" + "sub v12.8h, v11.8h, v24.8h\n\t" + "add v11.8h, v11.8h, v24.8h\n\t" + "sub v14.8h, v13.8h, v25.8h\n\t" + "add v13.8h, v13.8h, v25.8h\n\t" + "sub v16.8h, v15.8h, v26.8h\n\t" + "add v15.8h, v15.8h, v26.8h\n\t" + "sub v18.8h, v17.8h, v27.8h\n\t" + "add v17.8h, v17.8h, v27.8h\n\t" + "sub v20.8h, v19.8h, v28.8h\n\t" + "add v19.8h, v19.8h, v28.8h\n\t" + "sqdmulh v21.8h, v5.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v6.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v5.8h, v21.8h, v4.h[0]\n\t" + "mls v6.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v7.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v8.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v7.8h, v21.8h, v4.h[0]\n\t" + "mls v8.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v9.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v10.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v9.8h, v21.8h, v4.h[0]\n\t" + "mls v10.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v11.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v12.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v11.8h, v21.8h, v4.h[0]\n\t" + "mls v12.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v13.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v14.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, 
#11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v13.8h, v21.8h, v4.h[0]\n\t" + "mls v14.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v15.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v16.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v15.8h, v21.8h, v4.h[0]\n\t" + "mls v16.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v17.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v18.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v17.8h, v21.8h, v4.h[0]\n\t" + "mls v18.8h, v22.8h, v4.h[0]\n\t" + "sqdmulh v21.8h, v19.8h, v4.h[2]\n\t" + "sqdmulh v22.8h, v20.8h, v4.h[2]\n\t" + "sshr v21.8h, v21.8h, #11\n\t" + "sshr v22.8h, v22.8h, #11\n\t" + "mls v19.8h, v21.8h, v4.h[0]\n\t" + "mls v20.8h, v22.8h, v4.h[0]\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.4s, v5.4s, v6.4s\n\t" + "trn2 v6.4s, v29.4s, v6.4s\n\t" + "mov v29.16b, v5.16b\n\t" + "trn1 v5.2d, v5.2d, v6.2d\n\t" + "trn2 v6.2d, v29.2d, v6.2d\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.4s, v7.4s, v8.4s\n\t" + "trn2 v8.4s, v29.4s, v8.4s\n\t" + "mov v29.16b, v7.16b\n\t" + "trn1 v7.2d, v7.2d, v8.2d\n\t" + "trn2 v8.2d, v29.2d, v8.2d\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v29.4s, v10.4s\n\t" + "mov v29.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v29.2d, v10.2d\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v29.4s, v12.4s\n\t" + "mov v29.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v29.2d, v12.2d\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v29.4s, v14.4s\n\t" + "mov v29.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v29.2d, v14.2d\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v29.4s, v16.4s\n\t" + "mov v29.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v29.2d, v16.2d\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v29.4s, v18.4s\n\t" + "mov v29.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v29.2d, v18.2d\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v29.4s, v20.4s\n\t" + "mov v29.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v29.2d, v20.2d\n\t" + "stp q5, q6, [x1]\n\t" + "stp q7, q8, [x1, #32]\n\t" + "stp q9, q10, [x1, #64]\n\t" + "stp q11, q12, [x1, #96]\n\t" + "stp q13, q14, [x1, #128]\n\t" + "stp q15, q16, [x1, #160]\n\t" + "stp q17, q18, [x1, #192]\n\t" + "stp q19, q20, [x1, #224]\n\t" + : [r] "+r" (r) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv) + : "memory", "x1", "x2", "x3", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "cc" + ); +} + +void kyber_invntt_sqrdmlsh(sword16* r) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x2, %[L_kyber_aarch64_zetas_inv]\n\t" + "add x2, x2, :lo12:%[L_kyber_aarch64_zetas_inv]\n\t" +#else + "adrp x2, %[L_kyber_aarch64_zetas_inv]@PAGE\n\t" + "add x2, x2, 
%[L_kyber_aarch64_zetas_inv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]\n\t" + "add x3, x3, :lo12:%[L_kyber_aarch64_zetas_inv_qinv]\n\t" +#else + "adrp x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGE\n\t" + "add x3, x3, %[L_kyber_aarch64_zetas_inv_qinv]@PAGEOFF\n\t" +#endif /* __APPLE__ */ +#ifndef __APPLE__ + "adrp x4, %[L_kyber_aarch64_consts]\n\t" + "add x4, x4, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x4, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x4, x4, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "add x1, %x[r], #0x100\n\t" + "ldr q8, [x4]\n\t" + "ldp q9, q10, [%x[r]]\n\t" + "ldp q11, q12, [%x[r], #32]\n\t" + "ldp q13, q14, [%x[r], #64]\n\t" + "ldp q15, q16, [%x[r], #96]\n\t" + "ldp q17, q18, [%x[r], #128]\n\t" + "ldp q19, q20, [%x[r], #160]\n\t" + "ldp q21, q22, [%x[r], #192]\n\t" + "ldp q23, q24, [%x[r], #224]\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v25.2d, v12.2d\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v25.4s, v12.4s\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v25.2d, v16.2d\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v25.4s, v16.4s\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v25.2d, v20.2d\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v25.4s, v20.4s\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v24.2d, v25.2d, v24.2d\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v24.4s, v25.4s, v24.4s\n\t" + "ldr q0, [x2]\n\t" + "ldr q1, [x2, #16]\n\t" + "ldr q2, [x3]\n\t" + "ldr q3, [x3, #16]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #32]\n\t" + "ldr q1, [x2, #48]\n\t" + "ldr q2, [x3, #32]\n\t" + "ldr q3, [x3, #48]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, 
#1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #64]\n\t" + "ldr q1, [x2, #80]\n\t" + "ldr q2, [x3, #64]\n\t" + "ldr q3, [x3, #80]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #96]\n\t" + "ldr q1, [x2, #112]\n\t" + "ldr q2, [x3, #96]\n\t" + "ldr q3, [x3, #112]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #256]\n\t" + "ldr q1, [x2, #272]\n\t" + "ldr q2, [x3, #256]\n\t" + "ldr q3, [x3, #272]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #288]\n\t" + "ldr q1, [x2, #304]\n\t" + "ldr q2, [x3, #288]\n\t" + "ldr q3, [x3, #304]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #320]\n\t" + "ldr q1, [x2, #336]\n\t" + "ldr q2, [x3, #320]\n\t" + "ldr q3, [x3, #336]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #352]\n\t" + "ldr q1, [x2, #368]\n\t" + "ldr q2, [x3, #352]\n\t" + "ldr q3, [x3, #368]\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, 
v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #512]\n\t" + "ldr q2, [x3, #512]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, 
v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [%x[r]]\n\t" + "stp q11, q12, [%x[r], #32]\n\t" + "stp q13, q14, [%x[r], #64]\n\t" + "stp q15, q16, [%x[r], #96]\n\t" + "stp q17, q18, [%x[r], #128]\n\t" + "stp q19, q20, [%x[r], #160]\n\t" + "stp q21, q22, [%x[r], #192]\n\t" + "stp q23, q24, [%x[r], #224]\n\t" + "ldp q9, q10, [x1]\n\t" + "ldp q11, q12, [x1, #32]\n\t" + "ldp q13, q14, [x1, #64]\n\t" + "ldp q15, q16, [x1, #96]\n\t" + "ldp q17, q18, [x1, #128]\n\t" + "ldp q19, q20, [x1, #160]\n\t" + "ldp q21, q22, [x1, #192]\n\t" + "ldp q23, q24, [x1, #224]\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "mov v25.16b, v9.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v12.2d, v25.2d, v12.2d\n\t" + "mov v25.16b, v11.16b\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v12.4s, v25.4s, v12.4s\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "mov v25.16b, v13.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v16.2d, v25.2d, v16.2d\n\t" + "mov v25.16b, v15.16b\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v16.4s, v25.4s, v16.4s\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "mov v25.16b, v17.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v20.2d, v25.2d, v20.2d\n\t" + "mov v25.16b, v19.16b\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v20.4s, v25.4s, v20.4s\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "mov v25.16b, v21.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v24.2d, v25.2d, v24.2d\n\t" + "mov v25.16b, v23.16b\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v24.4s, v25.4s, v24.4s\n\t" + "ldr q0, [x2, #128]\n\t" + "ldr q1, [x2, #144]\n\t" + "ldr q2, [x3, #128]\n\t" + "ldr q3, [x3, #144]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #160]\n\t" + "ldr q1, [x2, #176]\n\t" + "ldr q2, [x3, #160]\n\t" + "ldr q3, [x3, #176]\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, 
v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #192]\n\t" + "ldr q1, [x2, #208]\n\t" + "ldr q2, [x3, #192]\n\t" + "ldr q3, [x3, #208]\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #224]\n\t" + "ldr q1, [x2, #240]\n\t" + "ldr q2, [x3, #224]\n\t" + "ldr q3, [x3, #240]\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #384]\n\t" + "ldr q1, [x2, #400]\n\t" + "ldr q2, [x3, #384]\n\t" + "ldr q3, [x3, #400]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.4s, v9.4s, v10.4s\n\t" + "trn1 v11.4s, v11.4s, v12.4s\n\t" + "trn2 v10.4s, v25.4s, v10.4s\n\t" + "trn2 v12.4s, v26.4s, v12.4s\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v10.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v12.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "ldr q0, [x2, #416]\n\t" + "ldr q1, [x2, #432]\n\t" + "ldr q2, [x3, #416]\n\t" + "ldr q3, [x3, #432]\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.4s, v13.4s, v14.4s\n\t" + "trn1 v15.4s, v15.4s, v16.4s\n\t" + "trn2 v14.4s, v25.4s, v14.4s\n\t" + "trn2 v16.4s, v26.4s, v16.4s\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v14.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v16.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "ldr q0, [x2, #448]\n\t" + "ldr q1, [x2, #464]\n\t" + "ldr q2, [x3, #448]\n\t" + "ldr q3, [x3, #464]\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.4s, v17.4s, v18.4s\n\t" + "trn1 v19.4s, v19.4s, v20.4s\n\t" + "trn2 v18.4s, v25.4s, v18.4s\n\t" + "trn2 v20.4s, v26.4s, v20.4s\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v18.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v20.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "ldr q0, [x2, #480]\n\t" + "ldr q1, [x2, #496]\n\t" + "ldr q2, [x3, #480]\n\t" + "ldr q3, [x3, #496]\n\t" + "mov v25.16b, 
v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.4s, v21.4s, v22.4s\n\t" + "trn1 v23.4s, v23.4s, v24.4s\n\t" + "trn2 v22.4s, v25.4s, v22.4s\n\t" + "trn2 v24.4s, v26.4s, v24.4s\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.8h\n\t" + "mul v27.8h, v28.8h, v3.8h\n\t" + "sqrdmulh v22.8h, v26.8h, v0.8h\n\t" + "sqrdmulh v24.8h, v28.8h, v1.8h\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "ldr q0, [x2, #528]\n\t" + "ldr q2, [x3, #528]\n\t" + "mov v25.16b, v9.16b\n\t" + "mov v26.16b, v11.16b\n\t" + "trn1 v9.2d, v9.2d, v10.2d\n\t" + "trn1 v11.2d, v11.2d, v12.2d\n\t" + "trn2 v10.2d, v25.2d, v10.2d\n\t" + "trn2 v12.2d, v26.2d, v12.2d\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v2.h[0]\n\t" + "mul v27.8h, v28.8h, v2.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v0.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v0.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mov v25.16b, v13.16b\n\t" + "mov v26.16b, v15.16b\n\t" + "trn1 v13.2d, v13.2d, v14.2d\n\t" + "trn1 v15.2d, v15.2d, v16.2d\n\t" + "trn2 v14.2d, v25.2d, v14.2d\n\t" + "trn2 v16.2d, v26.2d, v16.2d\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v2.h[2]\n\t" + "mul v27.8h, v28.8h, v2.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v0.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v0.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mov v25.16b, v17.16b\n\t" + "mov v26.16b, v19.16b\n\t" + "trn1 v17.2d, v17.2d, v18.2d\n\t" + "trn1 v19.2d, v19.2d, v20.2d\n\t" + "trn2 v18.2d, v25.2d, v18.2d\n\t" + "trn2 v20.2d, v26.2d, v20.2d\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v2.h[4]\n\t" + "mul v27.8h, v28.8h, v2.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v0.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v0.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mov v25.16b, v21.16b\n\t" + "mov v26.16b, v23.16b\n\t" + "trn1 v21.2d, v21.2d, v22.2d\n\t" + "trn1 v23.2d, v23.2d, v24.2d\n\t" + "trn2 v22.2d, v25.2d, v22.2d\n\t" + "trn2 v24.2d, v26.2d, v24.2d\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v2.h[6]\n\t" + "mul v27.8h, v28.8h, v2.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v0.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v0.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v11.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v11.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v13.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v15.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, 
#11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v13.8h, v25.8h, v8.h[0]\n\t" + "mls v15.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v19.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v19.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v21.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v23.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v21.8h, v25.8h, v8.h[0]\n\t" + "mls v23.8h, v26.8h, v8.h[0]\n\t" + "stp q9, q10, [x1]\n\t" + "stp q11, q12, [x1, #32]\n\t" + "stp q13, q14, [x1, #64]\n\t" + "stp q15, q16, [x1, #96]\n\t" + "stp q17, q18, [x1, #128]\n\t" + "stp q19, q20, [x1, #160]\n\t" + "stp q21, q22, [x1, #192]\n\t" + "stp q23, q24, [x1, #224]\n\t" + "ldr q4, [x2, #544]\n\t" + "ldr q5, [x2, #560]\n\t" + "ldr q6, [x3, #544]\n\t" + "ldr q7, [x3, #560]\n\t" + "ldr q9, [%x[r]]\n\t" + "ldr q10, [%x[r], #32]\n\t" + "ldr q11, [%x[r], #64]\n\t" + "ldr q12, [%x[r], #96]\n\t" + "ldr q13, [%x[r], #128]\n\t" + "ldr q14, [%x[r], #160]\n\t" + "ldr q15, [%x[r], #192]\n\t" + "ldr q16, [%x[r], #224]\n\t" + "ldr q17, [x1]\n\t" + "ldr q18, [x1, #32]\n\t" + "ldr q19, [x1, #64]\n\t" + "ldr q20, [x1, #96]\n\t" + "ldr q21, [x1, #128]\n\t" + "ldr q22, [x1, #160]\n\t" + "ldr q23, [x1, #192]\n\t" + "ldr q24, [x1, #224]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" + "sqrdmlsh v22.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, 
#1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, 
v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "mul v25.8h, v9.8h, v7.h[7]\n\t" + "mul v26.8h, v10.8h, v7.h[7]\n\t" + "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" + "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" + "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v25.8h, v11.8h, v7.h[7]\n\t" + "mul v26.8h, v12.8h, v7.h[7]\n\t" + "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" + "sqrdmulh v12.8h, v12.8h, v5.h[7]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v25.8h, v13.8h, v7.h[7]\n\t" + "mul v26.8h, v14.8h, v7.h[7]\n\t" + "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" + "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v25.8h, v15.8h, v7.h[7]\n\t" + "mul v26.8h, v16.8h, v7.h[7]\n\t" + "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" + "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "mul v25.8h, v17.8h, v7.h[7]\n\t" + "mul v26.8h, v18.8h, v7.h[7]\n\t" + "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" + "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" 
+ "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "mul v25.8h, v19.8h, v7.h[7]\n\t" + "mul v26.8h, v20.8h, v7.h[7]\n\t" + "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" + "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "mul v25.8h, v21.8h, v7.h[7]\n\t" + "mul v26.8h, v22.8h, v7.h[7]\n\t" + "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" + "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "mul v25.8h, v23.8h, v7.h[7]\n\t" + "mul v26.8h, v24.8h, v7.h[7]\n\t" + "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" + "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "str q9, [%x[r]]\n\t" + "str q10, [%x[r], #32]\n\t" + "str q11, [%x[r], #64]\n\t" + "str q12, [%x[r], #96]\n\t" + "str q13, [%x[r], #128]\n\t" + "str q14, [%x[r], #160]\n\t" + "str q15, [%x[r], #192]\n\t" + "str q16, [%x[r], #224]\n\t" + "str q17, [x1]\n\t" + "str q18, [x1, #32]\n\t" + "str q19, [x1, #64]\n\t" + "str q20, [x1, #96]\n\t" + "str q21, [x1, #128]\n\t" + "str q22, [x1, #160]\n\t" + "str q23, [x1, #192]\n\t" + "str q24, [x1, #224]\n\t" + "ldr q9, [%x[r], #16]\n\t" + "ldr q10, [%x[r], #48]\n\t" + "ldr q11, [%x[r], #80]\n\t" + "ldr q12, [%x[r], #112]\n\t" + "ldr q13, [%x[r], #144]\n\t" + "ldr q14, [%x[r], #176]\n\t" + "ldr q15, [%x[r], #208]\n\t" + "ldr q16, [%x[r], #240]\n\t" + "ldr q17, [x1, #16]\n\t" + "ldr q18, [x1, #48]\n\t" + "ldr q19, [x1, #80]\n\t" + "ldr q20, [x1, #112]\n\t" + "ldr q21, [x1, #144]\n\t" + "ldr q22, [x1, #176]\n\t" + "ldr q23, [x1, #208]\n\t" + "ldr q24, [x1, #240]\n\t" + "sub v26.8h, v9.8h, v10.8h\n\t" + "sub v28.8h, v11.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v10.8h\n\t" + "add v11.8h, v11.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v6.h[0]\n\t" + "mul v27.8h, v28.8h, v6.h[1]\n\t" + "sqrdmulh v10.8h, v26.8h, v4.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v4.h[1]\n\t" + "sqrdmlsh v10.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v14.8h\n\t" + "sub v28.8h, v15.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v14.8h\n\t" + "add v15.8h, v15.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v6.h[2]\n\t" + "mul v27.8h, v28.8h, v6.h[3]\n\t" + "sqrdmulh v14.8h, v26.8h, v4.h[2]\n\t" + "sqrdmulh v16.8h, v28.8h, v4.h[3]\n\t" + "sqrdmlsh v14.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v18.8h\n\t" + "sub v28.8h, v19.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v18.8h\n\t" + "add v19.8h, v19.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v6.h[4]\n\t" + "mul v27.8h, v28.8h, v6.h[5]\n\t" + "sqrdmulh v18.8h, v26.8h, v4.h[4]\n\t" + "sqrdmulh v20.8h, v28.8h, v4.h[5]\n\t" + "sqrdmlsh v18.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v22.8h\n\t" + "sub v28.8h, v23.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v22.8h\n\t" + "add v23.8h, v23.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v6.h[6]\n\t" + "mul v27.8h, v28.8h, v6.h[7]\n\t" + "sqrdmulh v22.8h, v26.8h, v4.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v4.h[7]\n\t" + "sqrdmlsh v22.8h, 
v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v11.8h\n\t" + "sub v28.8h, v10.8h, v12.8h\n\t" + "add v9.8h, v9.8h, v11.8h\n\t" + "add v10.8h, v10.8h, v12.8h\n\t" + "mul v25.8h, v26.8h, v7.h[0]\n\t" + "mul v27.8h, v28.8h, v7.h[0]\n\t" + "sqrdmulh v11.8h, v26.8h, v5.h[0]\n\t" + "sqrdmulh v12.8h, v28.8h, v5.h[0]\n\t" + "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v12.8h, v27.8h, v8.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "sub v26.8h, v13.8h, v15.8h\n\t" + "sub v28.8h, v14.8h, v16.8h\n\t" + "add v13.8h, v13.8h, v15.8h\n\t" + "add v14.8h, v14.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[1]\n\t" + "mul v27.8h, v28.8h, v7.h[1]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[1]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[1]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v19.8h\n\t" + "sub v28.8h, v18.8h, v20.8h\n\t" + "add v17.8h, v17.8h, v19.8h\n\t" + "add v18.8h, v18.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[2]\n\t" + "mul v27.8h, v28.8h, v7.h[2]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[2]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[2]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v21.8h, v23.8h\n\t" + "sub v28.8h, v22.8h, v24.8h\n\t" + "add v21.8h, v21.8h, v23.8h\n\t" + "add v22.8h, v22.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[3]\n\t" + "mul v27.8h, v28.8h, v7.h[3]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[3]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[3]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sub v26.8h, v9.8h, v13.8h\n\t" + "sub v28.8h, v10.8h, v14.8h\n\t" + "add v9.8h, v9.8h, v13.8h\n\t" + "add v10.8h, v10.8h, v14.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v13.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v14.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v14.8h, v27.8h, v8.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "sub v26.8h, v11.8h, v15.8h\n\t" + "sub v28.8h, v12.8h, v16.8h\n\t" + "add v11.8h, v11.8h, v15.8h\n\t" + "add v12.8h, v12.8h, v16.8h\n\t" + "mul v25.8h, v26.8h, v7.h[4]\n\t" + "mul v27.8h, v28.8h, v7.h[4]\n\t" + "sqrdmulh v15.8h, v26.8h, v5.h[4]\n\t" + "sqrdmulh v16.8h, v28.8h, v5.h[4]\n\t" + "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v16.8h, v27.8h, v8.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "sub v26.8h, v17.8h, v21.8h\n\t" + "sub v28.8h, v18.8h, v22.8h\n\t" + "add v17.8h, v17.8h, v21.8h\n\t" + "add v18.8h, v18.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v19.8h, v23.8h\n\t" + "sub v28.8h, v20.8h, v24.8h\n\t" + "add v19.8h, v19.8h, v23.8h\n\t" + "add v20.8h, v20.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[5]\n\t" + "mul v27.8h, v28.8h, v7.h[5]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[5]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[5]\n\t" + "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v24.8h, 
v27.8h, v8.h[0]\n\t" + "sshr v23.8h, v23.8h, #1\n\t" + "sshr v24.8h, v24.8h, #1\n\t" + "sqdmulh v25.8h, v9.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v10.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v9.8h, v25.8h, v8.h[0]\n\t" + "mls v10.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v11.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v12.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v11.8h, v25.8h, v8.h[0]\n\t" + "mls v12.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v17.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v18.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v17.8h, v25.8h, v8.h[0]\n\t" + "mls v18.8h, v26.8h, v8.h[0]\n\t" + "sqdmulh v25.8h, v19.8h, v8.h[2]\n\t" + "sqdmulh v26.8h, v20.8h, v8.h[2]\n\t" + "sshr v25.8h, v25.8h, #11\n\t" + "sshr v26.8h, v26.8h, #11\n\t" + "mls v19.8h, v25.8h, v8.h[0]\n\t" + "mls v20.8h, v26.8h, v8.h[0]\n\t" + "sub v26.8h, v9.8h, v17.8h\n\t" + "sub v28.8h, v10.8h, v18.8h\n\t" + "add v9.8h, v9.8h, v17.8h\n\t" + "add v10.8h, v10.8h, v18.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v17.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v18.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v18.8h, v27.8h, v8.h[0]\n\t" + "sshr v17.8h, v17.8h, #1\n\t" + "sshr v18.8h, v18.8h, #1\n\t" + "sub v26.8h, v11.8h, v19.8h\n\t" + "sub v28.8h, v12.8h, v20.8h\n\t" + "add v11.8h, v11.8h, v19.8h\n\t" + "add v12.8h, v12.8h, v20.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v19.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v20.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v20.8h, v27.8h, v8.h[0]\n\t" + "sshr v19.8h, v19.8h, #1\n\t" + "sshr v20.8h, v20.8h, #1\n\t" + "sub v26.8h, v13.8h, v21.8h\n\t" + "sub v28.8h, v14.8h, v22.8h\n\t" + "add v13.8h, v13.8h, v21.8h\n\t" + "add v14.8h, v14.8h, v22.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v21.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v22.8h, v28.8h, v5.h[6]\n\t" + "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" + "sqrdmlsh v22.8h, v27.8h, v8.h[0]\n\t" + "sshr v21.8h, v21.8h, #1\n\t" + "sshr v22.8h, v22.8h, #1\n\t" + "sub v26.8h, v15.8h, v23.8h\n\t" + "sub v28.8h, v16.8h, v24.8h\n\t" + "add v15.8h, v15.8h, v23.8h\n\t" + "add v16.8h, v16.8h, v24.8h\n\t" + "mul v25.8h, v26.8h, v7.h[6]\n\t" + "mul v27.8h, v28.8h, v7.h[6]\n\t" + "sqrdmulh v23.8h, v26.8h, v5.h[6]\n\t" + "sqrdmulh v24.8h, v28.8h, v5.h[6]\n\t" "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v24.8h, v27.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v27.8h, v27.8h, v8.h[0]\n\t" - "sub v23.8h, v23.8h, v25.8h\n\t" - "sub v24.8h, v24.8h, v27.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "mul v25.8h, v9.8h, v7.h[7]\n\t" "mul v26.8h, v10.8h, v7.h[7]\n\t" "sqrdmulh v9.8h, v9.8h, v5.h[7]\n\t" "sqrdmulh v10.8h, v10.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v9.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v10.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v9.8h, v9.8h, v25.8h\n\t" - "sub v10.8h, v10.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v9.8h, v9.8h, #1\n\t" "sshr v10.8h, v10.8h, #1\n\t" "mul v25.8h, v11.8h, v7.h[7]\n\t" "mul v26.8h, v12.8h, v7.h[7]\n\t" "sqrdmulh v11.8h, v11.8h, v5.h[7]\n\t" "sqrdmulh v12.8h, v12.8h, 
v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v11.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v12.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v11.8h, v11.8h, v25.8h\n\t" - "sub v12.8h, v12.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mul v25.8h, v13.8h, v7.h[7]\n\t" "mul v26.8h, v14.8h, v7.h[7]\n\t" "sqrdmulh v13.8h, v13.8h, v5.h[7]\n\t" "sqrdmulh v14.8h, v14.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v13.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v14.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v13.8h, v13.8h, v25.8h\n\t" - "sub v14.8h, v14.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "mul v25.8h, v15.8h, v7.h[7]\n\t" "mul v26.8h, v16.8h, v7.h[7]\n\t" "sqrdmulh v15.8h, v15.8h, v5.h[7]\n\t" "sqrdmulh v16.8h, v16.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v15.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v16.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v15.8h, v15.8h, v25.8h\n\t" - "sub v16.8h, v16.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "mul v25.8h, v17.8h, v7.h[7]\n\t" "mul v26.8h, v18.8h, v7.h[7]\n\t" "sqrdmulh v17.8h, v17.8h, v5.h[7]\n\t" "sqrdmulh v18.8h, v18.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v17.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v18.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v17.8h, v17.8h, v25.8h\n\t" - "sub v18.8h, v18.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v17.8h, v17.8h, #1\n\t" "sshr v18.8h, v18.8h, #1\n\t" "mul v25.8h, v19.8h, v7.h[7]\n\t" "mul v26.8h, v20.8h, v7.h[7]\n\t" "sqrdmulh v19.8h, v19.8h, v5.h[7]\n\t" "sqrdmulh v20.8h, v20.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v19.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v20.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v19.8h, v19.8h, v25.8h\n\t" - "sub v20.8h, v20.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v19.8h, v19.8h, #1\n\t" "sshr v20.8h, v20.8h, #1\n\t" "mul v25.8h, v21.8h, v7.h[7]\n\t" "mul v26.8h, v22.8h, v7.h[7]\n\t" "sqrdmulh v21.8h, v21.8h, v5.h[7]\n\t" "sqrdmulh v22.8h, v22.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v21.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v22.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v21.8h, v21.8h, v25.8h\n\t" - "sub v22.8h, v22.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v21.8h, v21.8h, #1\n\t" "sshr v22.8h, v22.8h, #1\n\t" "mul v25.8h, v23.8h, v7.h[7]\n\t" "mul v26.8h, v24.8h, v7.h[7]\n\t" "sqrdmulh v23.8h, v23.8h, v5.h[7]\n\t" "sqrdmulh v24.8h, v24.8h, v5.h[7]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH "sqrdmlsh v23.8h, v25.8h, v8.h[0]\n\t" "sqrdmlsh v24.8h, v26.8h, v8.h[0]\n\t" -#else - "sqrdmulh v25.8h, v25.8h, v8.h[0]\n\t" - "sqrdmulh v26.8h, v26.8h, v8.h[0]\n\t" - "sub v23.8h, v23.8h, v25.8h\n\t" - "sub v24.8h, v24.8h, v26.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v23.8h, v23.8h, #1\n\t" "sshr v24.8h, v24.8h, #1\n\t" "str q9, [%x[r], #16]\n\t" @@ -4596,135 
+5375,24 @@ void kyber_invntt(sword16* r) ); } +#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */ static const word16 L_kyber_aarch64_zetas_mul[] = { - 0x8b2, - 0xf74e, - 0x1ae, - 0xfe52, - 0x22b, - 0xfdd5, - 0x34b, - 0xfcb5, - 0x81e, - 0xf7e2, - 0x367, - 0xfc99, - 0x60e, - 0xf9f2, - 0x69, - 0xff97, - 0x1a6, - 0xfe5a, - 0x24b, - 0xfdb5, - 0xb1, - 0xff4f, - 0xc16, - 0xf3ea, - 0xbde, - 0xf422, - 0xb35, - 0xf4cb, - 0x626, - 0xf9da, - 0x675, - 0xf98b, - 0xc0b, - 0xf3f5, - 0x30a, - 0xfcf6, - 0x487, - 0xfb79, - 0xc6e, - 0xf392, - 0x9f8, - 0xf608, - 0x5cb, - 0xfa35, - 0xaa7, - 0xf559, - 0x45f, - 0xfba1, - 0x6cb, - 0xf935, - 0x284, - 0xfd7c, - 0x999, - 0xf667, - 0x15d, - 0xfea3, - 0x1a2, - 0xfe5e, - 0x149, - 0xfeb7, - 0xc65, - 0xf39b, - 0xcb6, - 0xf34a, - 0x331, - 0xfccf, - 0x449, - 0xfbb7, - 0x25b, - 0xfda5, - 0x262, - 0xfd9e, - 0x52a, - 0xfad6, - 0x7fc, - 0xf804, - 0x748, - 0xf8b8, - 0x180, - 0xfe80, - 0x842, - 0xf7be, - 0xc79, - 0xf387, - 0x4c2, - 0xfb3e, - 0x7ca, - 0xf836, - 0x997, - 0xf669, - 0xdc, - 0xff24, - 0x85e, - 0xf7a2, - 0x686, - 0xf97a, - 0x860, - 0xf7a0, - 0x707, - 0xf8f9, - 0x803, - 0xf7fd, - 0x31a, - 0xfce6, - 0x71b, - 0xf8e5, - 0x9ab, - 0xf655, - 0x99b, - 0xf665, - 0x1de, - 0xfe22, - 0xc95, - 0xf36b, - 0xbcd, - 0xf433, - 0x3e4, - 0xfc1c, - 0x3df, - 0xfc21, - 0x3be, - 0xfc42, - 0x74d, - 0xf8b3, - 0x5f2, - 0xfa0e, - 0x65c, - 0xf9a4, + 0x08b2, 0xf74e, 0x01ae, 0xfe52, 0x022b, 0xfdd5, 0x034b, 0xfcb5, + 0x081e, 0xf7e2, 0x0367, 0xfc99, 0x060e, 0xf9f2, 0x0069, 0xff97, + 0x01a6, 0xfe5a, 0x024b, 0xfdb5, 0x00b1, 0xff4f, 0x0c16, 0xf3ea, + 0x0bde, 0xf422, 0x0b35, 0xf4cb, 0x0626, 0xf9da, 0x0675, 0xf98b, + 0x0c0b, 0xf3f5, 0x030a, 0xfcf6, 0x0487, 0xfb79, 0x0c6e, 0xf392, + 0x09f8, 0xf608, 0x05cb, 0xfa35, 0x0aa7, 0xf559, 0x045f, 0xfba1, + 0x06cb, 0xf935, 0x0284, 0xfd7c, 0x0999, 0xf667, 0x015d, 0xfea3, + 0x01a2, 0xfe5e, 0x0149, 0xfeb7, 0x0c65, 0xf39b, 0x0cb6, 0xf34a, + 0x0331, 0xfccf, 0x0449, 0xfbb7, 0x025b, 0xfda5, 0x0262, 0xfd9e, + 0x052a, 0xfad6, 0x07fc, 0xf804, 0x0748, 0xf8b8, 0x0180, 0xfe80, + 0x0842, 0xf7be, 0x0c79, 0xf387, 0x04c2, 0xfb3e, 0x07ca, 0xf836, + 0x0997, 0xf669, 0x00dc, 0xff24, 0x085e, 0xf7a2, 0x0686, 0xf97a, + 0x0860, 0xf7a0, 0x0707, 0xf8f9, 0x0803, 0xf7fd, 0x031a, 0xfce6, + 0x071b, 0xf8e5, 0x09ab, 0xf655, 0x099b, 0xf665, 0x01de, 0xfe22, + 0x0c95, 0xf36b, 0x0bcd, 0xf433, 0x03e4, 0xfc1c, 0x03df, 0xfc21, + 0x03be, 0xfc42, 0x074d, 0xf8b3, 0x05f2, 0xfa0e, 0x065c, 0xf9a4, }; void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b) @@ -6877,120 +7545,80 @@ void kyber_to_mont(sword16* p) "mul v18.8h, v2.8h, v0.h[4]\n\t" "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v1.8h, v1.8h, v17.8h\n\t" "sub v2.8h, v2.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v1.8h, v1.8h, #1\n\t" "sshr v2.8h, v2.8h, #1\n\t" "mul v17.8h, v3.8h, v0.h[4]\n\t" "mul v18.8h, v4.8h, v0.h[4]\n\t" "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v3.8h, v3.8h, v17.8h\n\t" "sub v4.8h, v4.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v3.8h, v3.8h, #1\n\t" "sshr v4.8h, v4.8h, #1\n\t" "mul v17.8h, v5.8h, v0.h[4]\n\t" "mul 
v18.8h, v6.8h, v0.h[4]\n\t" "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v5.8h, v5.8h, v17.8h\n\t" "sub v6.8h, v6.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v5.8h, v5.8h, #1\n\t" "sshr v6.8h, v6.8h, #1\n\t" "mul v17.8h, v7.8h, v0.h[4]\n\t" "mul v18.8h, v8.8h, v0.h[4]\n\t" "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v7.8h, v7.8h, v17.8h\n\t" "sub v8.8h, v8.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v7.8h, v7.8h, #1\n\t" "sshr v8.8h, v8.8h, #1\n\t" "mul v17.8h, v9.8h, v0.h[4]\n\t" "mul v18.8h, v10.8h, v0.h[4]\n\t" "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v9.8h, v9.8h, v17.8h\n\t" "sub v10.8h, v10.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v9.8h, v9.8h, #1\n\t" "sshr v10.8h, v10.8h, #1\n\t" "mul v17.8h, v11.8h, v0.h[4]\n\t" "mul v18.8h, v12.8h, v0.h[4]\n\t" "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v11.8h, v11.8h, v17.8h\n\t" "sub v12.8h, v12.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mul v17.8h, v13.8h, v0.h[4]\n\t" "mul v18.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v13.8h, v13.8h, v17.8h\n\t" "sub v14.8h, v14.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "mul v17.8h, v15.8h, v0.h[4]\n\t" "mul v18.8h, v16.8h, v0.h[4]\n\t" "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v15.8h, v15.8h, v17.8h\n\t" "sub v16.8h, v16.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" @@ -7006,120 +7634,80 @@ void kyber_to_mont(sword16* p) "mul v18.8h, v2.8h, v0.h[4]\n\t" "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v1.8h, v1.8h, v17.8h\n\t" "sub v2.8h, v2.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr 
v1.8h, v1.8h, #1\n\t" "sshr v2.8h, v2.8h, #1\n\t" "mul v17.8h, v3.8h, v0.h[4]\n\t" "mul v18.8h, v4.8h, v0.h[4]\n\t" "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v3.8h, v3.8h, v17.8h\n\t" "sub v4.8h, v4.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v3.8h, v3.8h, #1\n\t" "sshr v4.8h, v4.8h, #1\n\t" "mul v17.8h, v5.8h, v0.h[4]\n\t" "mul v18.8h, v6.8h, v0.h[4]\n\t" "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v5.8h, v5.8h, v17.8h\n\t" "sub v6.8h, v6.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v5.8h, v5.8h, #1\n\t" "sshr v6.8h, v6.8h, #1\n\t" "mul v17.8h, v7.8h, v0.h[4]\n\t" "mul v18.8h, v8.8h, v0.h[4]\n\t" "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v7.8h, v7.8h, v17.8h\n\t" "sub v8.8h, v8.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v7.8h, v7.8h, #1\n\t" "sshr v8.8h, v8.8h, #1\n\t" "mul v17.8h, v9.8h, v0.h[4]\n\t" "mul v18.8h, v10.8h, v0.h[4]\n\t" "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v9.8h, v9.8h, v17.8h\n\t" "sub v10.8h, v10.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v9.8h, v9.8h, #1\n\t" "sshr v10.8h, v10.8h, #1\n\t" "mul v17.8h, v11.8h, v0.h[4]\n\t" "mul v18.8h, v12.8h, v0.h[4]\n\t" "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v11.8h, v11.8h, v17.8h\n\t" "sub v12.8h, v12.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v11.8h, v11.8h, #1\n\t" "sshr v12.8h, v12.8h, #1\n\t" "mul v17.8h, v13.8h, v0.h[4]\n\t" "mul v18.8h, v14.8h, v0.h[4]\n\t" "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v13.8h, v13.8h, v17.8h\n\t" "sub v14.8h, v14.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ "sshr v13.8h, v13.8h, #1\n\t" "sshr v14.8h, v14.8h, #1\n\t" "mul v17.8h, v15.8h, v0.h[4]\n\t" "mul v18.8h, v16.8h, v0.h[4]\n\t" "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" -#ifndef WOLFSSL_AARCH64_NO_SQRMLSH - "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" - "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" -#else "sqrdmulh v17.8h, v17.8h, v0.h[0]\n\t" "sqrdmulh v18.8h, v18.8h, v0.h[0]\n\t" "sub v15.8h, v15.8h, v17.8h\n\t" "sub v16.8h, v16.8h, v18.8h\n\t" -#endif /* !WOLFSSL_AARCH64_NO_SQRMLSH */ 
"sshr v15.8h, v15.8h, #1\n\t" "sshr v16.8h, v16.8h, #1\n\t" "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" @@ -7132,37 +7720,181 @@ void kyber_to_mont(sword16* p) ); } +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH +void kyber_to_mont_sqrdmlsh(sword16* p) +{ + __asm__ __volatile__ ( +#ifndef __APPLE__ + "adrp x1, %[L_kyber_aarch64_consts]\n\t" + "add x1, x1, :lo12:%[L_kyber_aarch64_consts]\n\t" +#else + "adrp x1, %[L_kyber_aarch64_consts]@PAGE\n\t" + "add x1, x1, %[L_kyber_aarch64_consts]@PAGEOFF\n\t" +#endif /* __APPLE__ */ + "ldr q0, [x1]\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, 
[%x[p]], #0x40\n\t" + "ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + "sub %x[p], %x[p], #0x100\n\t" + "mul v17.8h, v1.8h, v0.h[4]\n\t" + "mul v18.8h, v2.8h, v0.h[4]\n\t" + "sqrdmulh v1.8h, v1.8h, v0.h[3]\n\t" + "sqrdmulh v2.8h, v2.8h, v0.h[3]\n\t" + "sqrdmlsh v1.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v2.8h, v18.8h, v0.h[0]\n\t" + "sshr v1.8h, v1.8h, #1\n\t" + "sshr v2.8h, v2.8h, #1\n\t" + "mul v17.8h, v3.8h, v0.h[4]\n\t" + "mul v18.8h, v4.8h, v0.h[4]\n\t" + "sqrdmulh v3.8h, v3.8h, v0.h[3]\n\t" + "sqrdmulh v4.8h, v4.8h, v0.h[3]\n\t" + "sqrdmlsh v3.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v4.8h, v18.8h, v0.h[0]\n\t" + "sshr v3.8h, v3.8h, #1\n\t" + "sshr v4.8h, v4.8h, #1\n\t" + "mul v17.8h, v5.8h, v0.h[4]\n\t" + "mul v18.8h, v6.8h, v0.h[4]\n\t" + "sqrdmulh v5.8h, v5.8h, v0.h[3]\n\t" + "sqrdmulh v6.8h, v6.8h, v0.h[3]\n\t" + "sqrdmlsh v5.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v6.8h, v18.8h, v0.h[0]\n\t" + "sshr v5.8h, v5.8h, #1\n\t" + "sshr v6.8h, v6.8h, #1\n\t" + "mul v17.8h, v7.8h, v0.h[4]\n\t" + "mul v18.8h, v8.8h, v0.h[4]\n\t" + "sqrdmulh v7.8h, v7.8h, v0.h[3]\n\t" + "sqrdmulh v8.8h, v8.8h, v0.h[3]\n\t" + "sqrdmlsh v7.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v8.8h, v18.8h, v0.h[0]\n\t" + "sshr v7.8h, v7.8h, #1\n\t" + "sshr v8.8h, v8.8h, #1\n\t" + "mul v17.8h, v9.8h, v0.h[4]\n\t" + "mul v18.8h, v10.8h, v0.h[4]\n\t" + "sqrdmulh v9.8h, v9.8h, v0.h[3]\n\t" + "sqrdmulh v10.8h, v10.8h, v0.h[3]\n\t" + "sqrdmlsh v9.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v10.8h, v18.8h, v0.h[0]\n\t" + "sshr v9.8h, v9.8h, #1\n\t" + "sshr v10.8h, v10.8h, #1\n\t" + "mul v17.8h, v11.8h, v0.h[4]\n\t" + "mul v18.8h, v12.8h, v0.h[4]\n\t" + "sqrdmulh v11.8h, v11.8h, v0.h[3]\n\t" + "sqrdmulh v12.8h, v12.8h, v0.h[3]\n\t" + "sqrdmlsh v11.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v12.8h, v18.8h, v0.h[0]\n\t" + "sshr v11.8h, v11.8h, #1\n\t" + "sshr v12.8h, v12.8h, #1\n\t" + "mul v17.8h, v13.8h, v0.h[4]\n\t" + "mul v18.8h, v14.8h, v0.h[4]\n\t" + "sqrdmulh v13.8h, v13.8h, v0.h[3]\n\t" + "sqrdmulh v14.8h, v14.8h, v0.h[3]\n\t" + "sqrdmlsh v13.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v14.8h, v18.8h, v0.h[0]\n\t" + "sshr v13.8h, v13.8h, #1\n\t" + "sshr v14.8h, v14.8h, #1\n\t" + "mul v17.8h, v15.8h, v0.h[4]\n\t" + "mul v18.8h, v16.8h, v0.h[4]\n\t" + "sqrdmulh v15.8h, v15.8h, v0.h[3]\n\t" + "sqrdmulh v16.8h, v16.8h, v0.h[3]\n\t" + "sqrdmlsh v15.8h, v17.8h, v0.h[0]\n\t" + "sqrdmlsh v16.8h, v18.8h, v0.h[0]\n\t" + "sshr v15.8h, v15.8h, #1\n\t" + "sshr v16.8h, v16.8h, #1\n\t" + "st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [%x[p]], #0x40\n\t" + "st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [%x[p]], #0x40\n\t" + "st4 {v9.8h, v10.8h, v11.8h, v12.8h}, [%x[p]], #0x40\n\t" + "st4 {v13.8h, v14.8h, v15.8h, v16.8h}, [%x[p]], #0x40\n\t" + : [p] "+r" (p) + : [L_kyber_aarch64_q] "S" (L_kyber_aarch64_q), [L_kyber_aarch64_consts] "S" (L_kyber_aarch64_consts), [L_sha3_aarch64_r] "S" (L_sha3_aarch64_r), [L_kyber_aarch64_zetas] "S" (L_kyber_aarch64_zetas), [L_kyber_aarch64_zetas_qinv] "S" (L_kyber_aarch64_zetas_qinv), [L_kyber_aarch64_zetas_inv] "S" (L_kyber_aarch64_zetas_inv), [L_kyber_aarch64_zetas_inv_qinv] "S" (L_kyber_aarch64_zetas_inv_qinv), [L_kyber_aarch64_zetas_mul] "S" (L_kyber_aarch64_zetas_mul) + : "memory", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "cc" + ); +} + +#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */ static const word16 L_kyber_aarch64_to_msg_neon_low[] = { - 0x373, - 0x373, - 0x373, - 0x373, - 0x373, - 0x373, - 0x373, - 0x373, + 0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 0x0373, 
0x0373, }; static const word16 L_kyber_aarch64_to_msg_neon_high[] = { - 0x9c0, - 0x9c0, - 0x9c0, - 0x9c0, - 0x9c0, - 0x9c0, - 0x9c0, - 0x9c0, + 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, 0x09c0, }; static const word16 L_kyber_aarch64_to_msg_neon_bits[] = { - 0x1, - 0x2, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, + 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, }; void kyber_to_msg_neon(byte* msg, sword16* p) @@ -7399,33 +8131,12 @@ void kyber_to_msg_neon(byte* msg, sword16* p) } static const word16 L_kyber_aarch64_from_msg_neon_q1half[] = { - 0x681, - 0x681, - 0x681, - 0x681, - 0x681, - 0x681, - 0x681, - 0x681, + 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, 0x0681, }; static const word8 L_kyber_aarch64_from_msg_neon_bits[] = { - 0x1, - 0x2, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, - 0x1, - 0x2, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; void kyber_from_msg_neon(sword16* p, const byte* msg) @@ -7857,4124 +8568,526 @@ int kyber_cmp_neon(const byte* a, const byte* b, int sz) } static const word16 L_kyber_aarch64_rej_uniform_neon_mask[] = { - 0xfff, - 0xfff, - 0xfff, - 0xfff, - 0xfff, - 0xfff, - 0xfff, - 0xfff, + 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, 0x0fff, }; static const word16 L_kyber_aarch64_rej_uniform_neon_bits[] = { - 0x1, - 0x2, - 0x4, - 0x8, - 0x10, - 0x20, - 0x40, - 0x80, + 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, }; static const word8 L_kyber_aarch64_rej_uniform_neon_indices[] = { - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xff, - 0xff, - 0xff, - 
0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 
0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xff, - 0xff, - 0xff, - 0xff, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 
0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, 
- 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xff, - 0xff, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xe, - 0xf, - 0xff, - 
0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 
0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xe, - 0xf, - 0xff, - 0xff, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, 
- 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, 
- 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0xff, - 
0xff, - 0x0, - 0x1, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, - 0xff, - 0xff, - 0x0, - 0x1, - 0x2, - 0x3, - 0x4, - 0x5, - 0x6, - 0x7, - 0x8, - 0x9, - 0xa, - 0xb, - 0xc, - 0xd, - 0xe, - 0xf, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x0a, 
0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0xff, 0xff, 0xff, 0xff, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 
0x01, 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 
0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0e, 0x0f, 0xff, 0xff, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 
0x01, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, }; unsigned int kyber_rej_uniform_neon(sword16* p, unsigned int len, const byte* r, unsigned int rLen) diff --git a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c index 9db2f1488d..3ae8b90217 100644 --- a/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha3-asm_c.c @@ -37,30 +37,18 @@ #ifdef WOLFSSL_SHA3 #ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 static const word64 L_SHA3_transform_crypto_r[] = { - 0x1UL, - 0x8082UL, - 0x800000000000808aUL, - 0x8000000080008000UL, - 0x808bUL, - 0x80000001UL, - 0x8000000080008081UL, - 0x8000000000008009UL, - 0x8aUL, - 0x88UL, - 0x80008009UL, - 0x8000000aUL, - 0x8000808bUL, - 0x800000000000008bUL, - 0x8000000000008089UL, - 0x8000000000008003UL, - 0x8000000000008002UL, - 0x8000000000000080UL, - 0x800aUL, - 0x800000008000000aUL, - 0x8000000080008081UL, - 0x8000000000008080UL, - 0x80000001UL, - 0x8000000080008008UL, + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808a, 0x8000000080008000, + 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008a, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, + 0x8000000080008081, 0x8000000000008080, + 0x0000000080000001, 0x8000000080008008, }; void BlockSha3_crypto(word64* state) @@ -183,30 +171,18 @@ void BlockSha3_crypto(word64* state) #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ static const word64 L_SHA3_transform_base_r[] = { - 0x1UL, - 0x8082UL, - 0x800000000000808aUL, - 0x8000000080008000UL, - 0x808bUL, - 0x80000001UL, - 0x8000000080008081UL, - 0x8000000000008009UL, - 0x8aUL, - 0x88UL, - 0x80008009UL, - 0x8000000aUL, - 0x8000808bUL, - 0x800000000000008bUL, - 0x8000000000008089UL, - 0x8000000000008003UL, - 0x8000000000008002UL, - 0x8000000000000080UL, - 0x800aUL, - 0x800000008000000aUL, - 0x8000000080008081UL, - 0x8000000000008080UL, - 0x80000001UL, - 0x8000000080008008UL, + 0x0000000000000001, 0x0000000000008082, + 0x800000000000808a, 0x8000000080008000, + 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, + 0x000000000000008a, 0x0000000000000088, + 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, + 0x8000000000008089, 0x8000000000008003, + 0x8000000000008002, 0x8000000000000080, + 
0x000000000000800a, 0x800000008000000a, + 0x8000000080008081, 0x8000000000008080, + 0x0000000080000001, 0x8000000080008008, }; void BlockSha3_base(word64* state) diff --git a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c index e1c038536c..9241998192 100644 --- a/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c +++ b/wolfcrypt/src/port/arm/armv8-sha512-asm_c.c @@ -36,91 +36,50 @@ #ifdef WOLFSSL_SHA512 static const word64 L_SHA512_transform_neon_len_k[] = { - 0x428a2f98d728ae22UL, - 0x7137449123ef65cdUL, - 0xb5c0fbcfec4d3b2fUL, - 0xe9b5dba58189dbbcUL, - 0x3956c25bf348b538UL, - 0x59f111f1b605d019UL, - 0x923f82a4af194f9bUL, - 0xab1c5ed5da6d8118UL, - 0xd807aa98a3030242UL, - 0x12835b0145706fbeUL, - 0x243185be4ee4b28cUL, - 0x550c7dc3d5ffb4e2UL, - 0x72be5d74f27b896fUL, - 0x80deb1fe3b1696b1UL, - 0x9bdc06a725c71235UL, - 0xc19bf174cf692694UL, - 0xe49b69c19ef14ad2UL, - 0xefbe4786384f25e3UL, - 0xfc19dc68b8cd5b5UL, - 0x240ca1cc77ac9c65UL, - 0x2de92c6f592b0275UL, - 0x4a7484aa6ea6e483UL, - 0x5cb0a9dcbd41fbd4UL, - 0x76f988da831153b5UL, - 0x983e5152ee66dfabUL, - 0xa831c66d2db43210UL, - 0xb00327c898fb213fUL, - 0xbf597fc7beef0ee4UL, - 0xc6e00bf33da88fc2UL, - 0xd5a79147930aa725UL, - 0x6ca6351e003826fUL, - 0x142929670a0e6e70UL, - 0x27b70a8546d22ffcUL, - 0x2e1b21385c26c926UL, - 0x4d2c6dfc5ac42aedUL, - 0x53380d139d95b3dfUL, - 0x650a73548baf63deUL, - 0x766a0abb3c77b2a8UL, - 0x81c2c92e47edaee6UL, - 0x92722c851482353bUL, - 0xa2bfe8a14cf10364UL, - 0xa81a664bbc423001UL, - 0xc24b8b70d0f89791UL, - 0xc76c51a30654be30UL, - 0xd192e819d6ef5218UL, - 0xd69906245565a910UL, - 0xf40e35855771202aUL, - 0x106aa07032bbd1b8UL, - 0x19a4c116b8d2d0c8UL, - 0x1e376c085141ab53UL, - 0x2748774cdf8eeb99UL, - 0x34b0bcb5e19b48a8UL, - 0x391c0cb3c5c95a63UL, - 0x4ed8aa4ae3418acbUL, - 0x5b9cca4f7763e373UL, - 0x682e6ff3d6b2b8a3UL, - 0x748f82ee5defb2fcUL, - 0x78a5636f43172f60UL, - 0x84c87814a1f0ab72UL, - 0x8cc702081a6439ecUL, - 0x90befffa23631e28UL, - 0xa4506cebde82bde9UL, - 0xbef9a3f7b2c67915UL, - 0xc67178f2e372532bUL, - 0xca273eceea26619cUL, - 0xd186b8c721c0c207UL, - 0xeada7dd6cde0eb1eUL, - 0xf57d4f7fee6ed178UL, - 0x6f067aa72176fbaUL, - 0xa637dc5a2c898a6UL, - 0x113f9804bef90daeUL, - 0x1b710b35131c471bUL, - 0x28db77f523047d84UL, - 0x32caab7b40c72493UL, - 0x3c9ebe0a15c9bebcUL, - 0x431d67c49c100d4cUL, - 0x4cc5d4becb3e42b6UL, - 0x597f299cfc657e2aUL, - 0x5fcb6fab3ad6faecUL, - 0x6c44198c4a475817UL, + 0x428a2f98d728ae22, 0x7137449123ef65cd, + 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, + 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, + 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, + 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, + 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, + 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, + 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, + 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, + 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 
0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, + 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, + 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, + 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, + 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, }; static const word64 L_SHA512_transform_neon_len_ror8[] = { - 0x7060504030201UL, - 0x80f0e0d0c0b0a09UL, + 0x0007060504030201, 0x080f0e0d0c0b0a09, }; void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len) @@ -1054,86 +1013,46 @@ void Transform_Sha512_Len_neon(wc_Sha512* sha512, const byte* data, word32 len) #ifdef WOLFSSL_ARMASM_CRYPTO_SHA512 static const word64 L_SHA512_transform_crypto_len_k[] = { - 0x428a2f98d728ae22UL, - 0x7137449123ef65cdUL, - 0xb5c0fbcfec4d3b2fUL, - 0xe9b5dba58189dbbcUL, - 0x3956c25bf348b538UL, - 0x59f111f1b605d019UL, - 0x923f82a4af194f9bUL, - 0xab1c5ed5da6d8118UL, - 0xd807aa98a3030242UL, - 0x12835b0145706fbeUL, - 0x243185be4ee4b28cUL, - 0x550c7dc3d5ffb4e2UL, - 0x72be5d74f27b896fUL, - 0x80deb1fe3b1696b1UL, - 0x9bdc06a725c71235UL, - 0xc19bf174cf692694UL, - 0xe49b69c19ef14ad2UL, - 0xefbe4786384f25e3UL, - 0xfc19dc68b8cd5b5UL, - 0x240ca1cc77ac9c65UL, - 0x2de92c6f592b0275UL, - 0x4a7484aa6ea6e483UL, - 0x5cb0a9dcbd41fbd4UL, - 0x76f988da831153b5UL, - 0x983e5152ee66dfabUL, - 0xa831c66d2db43210UL, - 0xb00327c898fb213fUL, - 0xbf597fc7beef0ee4UL, - 0xc6e00bf33da88fc2UL, - 0xd5a79147930aa725UL, - 0x6ca6351e003826fUL, - 0x142929670a0e6e70UL, - 0x27b70a8546d22ffcUL, - 0x2e1b21385c26c926UL, - 0x4d2c6dfc5ac42aedUL, - 0x53380d139d95b3dfUL, - 0x650a73548baf63deUL, - 0x766a0abb3c77b2a8UL, - 0x81c2c92e47edaee6UL, - 0x92722c851482353bUL, - 0xa2bfe8a14cf10364UL, - 0xa81a664bbc423001UL, - 0xc24b8b70d0f89791UL, - 0xc76c51a30654be30UL, - 0xd192e819d6ef5218UL, - 0xd69906245565a910UL, - 0xf40e35855771202aUL, - 0x106aa07032bbd1b8UL, - 0x19a4c116b8d2d0c8UL, - 0x1e376c085141ab53UL, - 0x2748774cdf8eeb99UL, - 0x34b0bcb5e19b48a8UL, - 0x391c0cb3c5c95a63UL, - 0x4ed8aa4ae3418acbUL, - 0x5b9cca4f7763e373UL, - 0x682e6ff3d6b2b8a3UL, - 0x748f82ee5defb2fcUL, - 0x78a5636f43172f60UL, - 0x84c87814a1f0ab72UL, - 0x8cc702081a6439ecUL, - 0x90befffa23631e28UL, - 0xa4506cebde82bde9UL, - 0xbef9a3f7b2c67915UL, - 0xc67178f2e372532bUL, - 0xca273eceea26619cUL, - 0xd186b8c721c0c207UL, - 0xeada7dd6cde0eb1eUL, - 0xf57d4f7fee6ed178UL, - 0x6f067aa72176fbaUL, - 0xa637dc5a2c898a6UL, - 0x113f9804bef90daeUL, - 0x1b710b35131c471bUL, - 0x28db77f523047d84UL, - 0x32caab7b40c72493UL, - 0x3c9ebe0a15c9bebcUL, - 0x431d67c49c100d4cUL, - 0x4cc5d4becb3e42b6UL, - 0x597f299cfc657e2aUL, - 0x5fcb6fab3ad6faecUL, - 0x6c44198c4a475817UL, + 0x428a2f98d728ae22, 0x7137449123ef65cd, + 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, + 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, + 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, + 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, + 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, + 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, + 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, + 0x06ca6351e003826f, 
0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, + 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, + 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, + 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, + 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, + 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, + 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, + 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, + 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, + 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, + 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, + 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, + 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, }; void Transform_Sha512_Len_crypto(wc_Sha512* sha512, const byte* data, word32 len); diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c index 76b5cd5d77..37902b7e11 100644 --- a/wolfcrypt/src/wc_kyber_poly.c +++ b/wolfcrypt/src/wc_kyber_poly.c @@ -84,7 +84,8 @@ /* Declared in wc_kyber.c to stop compiler optimizer from simplifying. */ extern volatile sword16 kyber_opt_blocker; -#ifdef USE_INTEL_SPEEDUP +#if defined(USE_INTEL_SPEEDUP) || (defined(__aarch64__) && \ + defined(WOLFSSL_ARMASM)) static word32 cpuid_flags = 0; #endif @@ -1099,7 +1100,8 @@ static void kyber_pointwise_acc_mont(sword16* r, const sword16* a, */ void kyber_init(void) { -#ifdef USE_INTEL_SPEEDUP +#if defined(USE_INTEL_SPEEDUP) || (defined(__aarch64__) && \ + defined(WOLFSSL_ARMASM)) cpuid_flags = cpuid_get_flags(); #endif } @@ -1121,22 +1123,48 @@ void kyber_keygen(sword16* priv, sword16* pub, sword16* e, const sword16* a, { int i; - /* Transform private key. All of result used in public key calculation */ - for (i = 0; i < kp; ++i) { - kyber_ntt(priv + i * KYBER_N); +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH + if (IS_AARCH64_RDM(cpuid_flags)) { + /* Transform private key. All of result used in public key calculation. + */ + for (i = 0; i < kp; ++i) { + kyber_ntt_sqrdmlsh(priv + i * KYBER_N); + } + + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply a by private into public polynomial. */ + kyber_pointwise_acc_mont(pub + i * KYBER_N, a + i * kp * KYBER_N, + priv, kp); + /* Convert public polynomial to Montgomery form. */ + kyber_to_mont_sqrdmlsh(pub + i * KYBER_N); + /* Transform error values polynomial. */ + kyber_ntt_sqrdmlsh(e + i * KYBER_N); + /* Add errors to public key and reduce. */ + kyber_add_reduce(pub + i * KYBER_N, e + i * KYBER_N); + } } + else +#endif + { + /* Transform private key. All of result used in public key calculation. + */ + for (i = 0; i < kp; ++i) { + kyber_ntt(priv + i * KYBER_N); + } - /* For each polynomial in the vectors. */ - for (i = 0; i < kp; ++i) { - /* Multiply a by private into public polynomial. */ - kyber_pointwise_acc_mont(pub + i * KYBER_N, a + i * kp * KYBER_N, priv, - kp); - /* Convert public polynomial to Montgomery form. */ - kyber_to_mont(pub + i * KYBER_N); - /* Transform error values polynomial. */ - kyber_ntt(e + i * KYBER_N); - /* Add errors to public key and reduce. */ - kyber_add_reduce(pub + i * KYBER_N, e + i * KYBER_N); + /* For each polynomial in the vectors. 
*/ + for (i = 0; i < kp; ++i) { + /* Multiply a by private into public polynomial. */ + kyber_pointwise_acc_mont(pub + i * KYBER_N, a + i * kp * KYBER_N, + priv, kp); + /* Convert public polynomial to Montgomery form. */ + kyber_to_mont(pub + i * KYBER_N); + /* Transform error values polynomial. */ + kyber_ntt(e + i * KYBER_N); + /* Add errors to public key and reduce. */ + kyber_add_reduce(pub + i * KYBER_N, e + i * KYBER_N); + } } } @@ -1158,26 +1186,53 @@ void kyber_encapsulate(const sword16* pub, sword16* bp, sword16* v, { int i; - /* Transform sp. All of result used in calculation of bp and v. */ - for (i = 0; i < kp; ++i) { - kyber_ntt(sp + i * KYBER_N); - } +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH + if (IS_AARCH64_RDM(cpuid_flags)) { + /* Transform sp. All of result used in calculation of bp and v. */ + for (i = 0; i < kp; ++i) { + kyber_ntt_sqrdmlsh(sp + i * KYBER_N); + } - /* For each polynomial in the vectors. */ - for (i = 0; i < kp; ++i) { - /* Multiply at by sp into bp polynomial. */ - kyber_pointwise_acc_mont(bp + i * KYBER_N, at + i * kp * KYBER_N, sp, - kp); - /* Inverse transform bp polynomial. */ - kyber_invntt(bp + i * KYBER_N); - /* Add errors to bp and reduce. */ - kyber_add_reduce(bp + i * KYBER_N, ep + i * KYBER_N); + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply at by sp into bp polynomial. */ + kyber_pointwise_acc_mont(bp + i * KYBER_N, at + i * kp * KYBER_N, + sp, kp); + /* Inverse transform bp polynomial. */ + kyber_invntt_sqrdmlsh(bp + i * KYBER_N); + /* Add errors to bp and reduce. */ + kyber_add_reduce(bp + i * KYBER_N, ep + i * KYBER_N); + } + + /* Multiply public key by sp into v polynomial. */ + kyber_pointwise_acc_mont(v, pub, sp, kp); + /* Inverse transform v. */ + kyber_invntt_sqrdmlsh(v); } + else +#endif + { + /* Transform sp. All of result used in calculation of bp and v. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(sp + i * KYBER_N); + } - /* Multiply public key by sp into v polynomial. */ - kyber_pointwise_acc_mont(v, pub, sp, kp); - /* Inverse transform v. */ - kyber_invntt(v); + /* For each polynomial in the vectors. */ + for (i = 0; i < kp; ++i) { + /* Multiply at by sp into bp polynomial. */ + kyber_pointwise_acc_mont(bp + i * KYBER_N, at + i * kp * KYBER_N, + sp, kp); + /* Inverse transform bp polynomial. */ + kyber_invntt(bp + i * KYBER_N); + /* Add errors to bp and reduce. */ + kyber_add_reduce(bp + i * KYBER_N, ep + i * KYBER_N); + } + + /* Multiply public key by sp into v polynomial. */ + kyber_pointwise_acc_mont(v, pub, sp, kp); + /* Inverse transform v. */ + kyber_invntt(v); + } /* Add errors and message to v and reduce. */ kyber_add3_reduce(v, epp, m); } @@ -1195,15 +1250,31 @@ void kyber_decapsulate(const sword16* priv, sword16* mp, sword16* bp, { int i; - /* Transform bp. All of result used in calculation of mp. */ - for (i = 0; i < kp; ++i) { - kyber_ntt(bp + i * KYBER_N); +#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH + if (IS_AARCH64_RDM(cpuid_flags)) { + /* Transform bp. All of result used in calculation of mp. */ + for (i = 0; i < kp; ++i) { + kyber_ntt_sqrdmlsh(bp + i * KYBER_N); + } + + /* Multiply private key by bp into mp polynomial. */ + kyber_pointwise_acc_mont(mp, priv, bp, kp); + /* Inverse transform mp. */ + kyber_invntt_sqrdmlsh(mp); } + else +#endif + { + /* Transform bp. All of result used in calculation of mp. */ + for (i = 0; i < kp; ++i) { + kyber_ntt(bp + i * KYBER_N); + } - /* Multiply private key by bp into mp polynomial. 
*/ - kyber_pointwise_acc_mont(mp, priv, bp, kp); - /* Inverse transform mp. */ - kyber_invntt(mp); + /* Multiply private key by bp into mp polynomial. */ + kyber_pointwise_acc_mont(mp, priv, bp, kp); + /* Inverse transform mp. */ + kyber_invntt(mp); + } /* Subtract errors (mp) out of v and reduce into mp. */ kyber_rsub_reduce(mp, v); } diff --git a/wolfssl/wolfcrypt/aes.h b/wolfssl/wolfcrypt/aes.h index d1b71e569c..0f39bed526 100644 --- a/wolfssl/wolfcrypt/aes.h +++ b/wolfssl/wolfcrypt/aes.h @@ -309,6 +309,7 @@ struct Aes { byte use_aes_hw_crypto; #ifdef HAVE_AESGCM byte use_pmull_hw_crypto; + byte use_sha3_hw_crypto; #endif #endif /* __aarch64__ && WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_HW_CRYPTO */ #ifdef WOLF_CRYPTO_CB @@ -841,6 +842,7 @@ WOLFSSL_API int wc_AesEaxFree(AesEax* eax); #if defined(__aarch64__) && defined(WOLFSSL_ARMASM) && \ !defined(WOLFSSL_ARMASM_NO_HW_CRYPTO) + /* GHASH one block of data. * * XOR block into tag and GMULT with H. @@ -848,7 +850,7 @@ WOLFSSL_API int wc_AesEaxFree(AesEax* eax); * @param [in, out] aes AES GCM object. * @param [in] block Block of AAD or cipher text. */ -#define GHASH_ONE_BLOCK(aes, block) \ +#define GHASH_ONE_BLOCK_AARCH64(aes, block) \ do { \ xorbuf(AES_TAG(aes), block, WC_AES_BLOCK_SIZE); \ GMULT_AARCH64(AES_TAG(aes), aes->gcm.H); \ diff --git a/wolfssl/wolfcrypt/wc_kyber.h b/wolfssl/wolfcrypt/wc_kyber.h index 79a03cbd0d..073c383f66 100644 --- a/wolfssl/wolfcrypt/wc_kyber.h +++ b/wolfssl/wolfcrypt/wc_kyber.h @@ -292,6 +292,8 @@ int kyber_cmp_avx2(const byte* a, const byte* b, int sz); #elif defined(__aarch64__) && defined(WOLFSSL_ARMASM) WOLFSSL_LOCAL void kyber_ntt(sword16* r); WOLFSSL_LOCAL void kyber_invntt(sword16* r); +WOLFSSL_LOCAL void kyber_ntt_sqrdmlsh(sword16* r); +WOLFSSL_LOCAL void kyber_invntt_sqrdmlsh(sword16* r); WOLFSSL_LOCAL void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b); WOLFSSL_LOCAL void kyber_basemul_mont_add(sword16* r, const sword16* a, @@ -301,6 +303,7 @@ WOLFSSL_LOCAL void kyber_add3_reduce(sword16* r, const sword16* a, const sword16* b); WOLFSSL_LOCAL void kyber_rsub_reduce(sword16* r, const sword16* a); WOLFSSL_LOCAL void kyber_to_mont(sword16* p); +WOLFSSL_LOCAL void kyber_to_mont_sqrdmlsh(sword16* p); WOLFSSL_LOCAL void kyber_sha3_blocksx3_neon(word64* state); WOLFSSL_LOCAL void kyber_shake128_blocksx3_seed_neon(word64* state, byte* seed); WOLFSSL_LOCAL void kyber_shake256_blocksx3_seed_neon(word64* state, byte* seed);
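
Note (illustration only, not part of the patch): the hunks above all follow the same runtime-dispatch pattern. CPU feature flags are read once at init time (cpuid_get_flags() cached in cpuid_flags, or the use_*_hw_crypto bytes in struct Aes), and each hot routine then branches between a specialised assembly path and a base path (GHASH_ONE_BLOCK_AARCH64 vs GHASH_ONE_BLOCK_SW, kyber_ntt_sqrdmlsh vs kyber_ntt). The standalone sketch below shows that pattern in miniature; the feature constant, detection routine, and ntt_* names are illustrative stand-ins, not wolfSSL APIs.

#include <stdint.h>
#include <stdio.h>

#define FEAT_RDM 0x1u                  /* stand-in for the RDM/SQRDMLSH feature bit */

static uint32_t cpu_flags;             /* cached once, like cpuid_flags */

static void ntt_base(int16_t* p)     { (void)p; puts("base NEON path"); }
static void ntt_sqrdmlsh(int16_t* p) { (void)p; puts("SQRDMLSH path");  }

static uint32_t detect_cpu_flags(void)
{
    /* Real code queries the CPU (e.g. HWCAP/system registers); hard-coded
     * here purely for the sketch. */
    return FEAT_RDM;
}

static void crypto_init(void)
{
    cpu_flags = detect_cpu_flags();    /* mirrors kyber_init() caching the flags */
}

static void ntt(int16_t* p)
{
    if (cpu_flags & FEAT_RDM)
        ntt_sqrdmlsh(p);               /* specialised instruction available */
    else
        ntt_base(p);                   /* fall back to the base implementation */
}

int main(void)
{
    int16_t poly[256] = {0};
    crypto_init();
    ntt(poly);
    return 0;
}

The design point carried over from the patch is that detection happens once and the per-call dispatch cost is a single cached-flag test, so the base and specialised assembly variants can be built side by side and selected at run time rather than at compile time.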