diff --git a/Cargo.toml b/Cargo.toml index 0898ddd1af..7160e0362c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,6 @@ include = [ "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", - "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", @@ -106,7 +105,6 @@ include = [ "crypto/cipher_extra/asm/chacha20_poly1305_armv8.pl", "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl", "examples/**/*.rs", - "include/ring-core/aes.h", "include/ring-core/arm_arch.h", "include/ring-core/asm_base.h", "include/ring-core/base.h", diff --git a/build.rs b/build.rs index 9d56a3ef05..2c473a3061 100644 --- a/build.rs +++ b/build.rs @@ -53,7 +53,6 @@ const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), - (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), @@ -869,9 +868,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "aes_hw_ctr32_encrypt_blocks", "aes_hw_encrypt", "aes_hw_set_encrypt_key", - "aes_nohw_ctr32_encrypt_blocks", - "aes_nohw_encrypt", - "aes_nohw_set_encrypt_key", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index 9530cbc9b2..0000000000 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,881 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit or 64-bit, operating on two-, four-, and eight-block -// batches, respectively. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". 
Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) -// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. - -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} - -OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. -// -// This implementation uses three representations for AES blocks. First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. 
For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... -// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is -// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| -// |AES_KEY|s so it should not be used as a long-term key representation. -typedef struct { - // keys is an array of batches, one for each round key. Each batch stores - // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. - AES_NOHW_BATCH keys[AES_MAXNR + 1]; -} AES_NOHW_SCHEDULE; - -// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in -// compact form. -static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, - const aes_word_t in[AES_NOHW_BLOCK_WORDS], - size_t i) { - // Note the words are interleaved. The order comes from |aes_nohw_transpose|. - // If |i| is zero and this is the 64-bit implementation, in[0] contains bits - // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at - // w[4] so that bits 0 and 4 are in the correct position. (In general, bits - // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares - // will be correctly placed.) - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_64_BIT) - batch->w[i] = in[0]; - batch->w[i + 4] = in[1]; -#else - batch->w[i] = in[0]; - batch->w[i + 2] = in[1]; - batch->w[i + 4] = in[2]; - batch->w[i + 6] = in[3]; -#endif -} - -// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in -// compact form. 
-static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, - aes_word_t out[AES_NOHW_BLOCK_WORDS], - size_t i) { - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_64_BIT) - out[0] = batch->w[i]; - out[1] = batch->w[i + 4]; -#else - out[0] = batch->w[i]; - out[1] = batch->w[i + 2]; - out[2] = batch->w[i + 4]; - out[3] = batch->w[i + 6]; -#endif -} - -// aes_nohw_delta_swap returns |a| with bits |a & mask| and -// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. -static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, - aes_word_t shift) { - // See - // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ - aes_word_t b = (a ^ (a >> shift)) & mask; - return a ^ b ^ (b << shift); -} - -// In the 32-bit and 64-bit implementations, a block spans multiple words. -// |aes_nohw_compact_block| must permute bits across different words. First we -// implement |aes_nohw_compact_word| which performs a smaller version of the -// transformation which stays within a single word. -// -// These transformations are generalizations of the output of -// http://programming.sirrida.de/calcperm.php on smaller inputs. -#if defined(OPENSSL_64_BIT) -static inline uint64_t aes_nohw_compact_word(uint64_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap - // quartets of those chunks: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => - // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - return a; -} - -static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - return a; -} -#else // !OPENSSL_64_BIT -static inline uint32_t aes_nohw_compact_word(uint32_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 - // Note: 0x00cc = 0b0000_0000_1100_1100 - // 0x00cc << 6 = 0b0011_0011_0000_0000 - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - // Now we swap groups of four bits (still numbering by pairs): - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => - // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 - // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - return a; -} - -static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. 
- a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - return a; -} - -static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, - uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | - ((uint32_t)a3 << 24); -} - -static inline uint8_t lo(uint32_t a) { - return (uint8_t)a; -} - -#endif // OPENSSL_64_BIT - -static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const uint8_t in[16]) { - OPENSSL_memcpy(out, in, 16); -#if defined(OPENSSL_64_BIT) - uint64_t a0 = aes_nohw_compact_word(out[0]); - uint64_t a1 = aes_nohw_compact_word(out[1]); - out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); - out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); -#else - uint32_t a0 = aes_nohw_compact_word(out[0]); - uint32_t a1 = aes_nohw_compact_word(out[1]); - uint32_t a2 = aes_nohw_compact_word(out[2]); - uint32_t a3 = aes_nohw_compact_word(out[3]); - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); -#endif -} - -static inline void aes_nohw_uncompact_block( - uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { -#if defined(OPENSSL_64_BIT) - uint64_t a0 = in[0]; - uint64_t a1 = in[1]; - uint64_t b0 = - aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); - uint64_t b1 = - aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - OPENSSL_memcpy(out, &b0, 8); - OPENSSL_memcpy(out + 8, &b1, 8); -#else - uint32_t a0 = in[0]; - uint32_t a1 = in[1]; - uint32_t a2 = in[2]; - uint32_t a3 = in[3]; - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - uint32_t b2 = - aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - uint32_t b3 = - aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); - b0 = aes_nohw_uncompact_word(b0); - b1 = aes_nohw_uncompact_word(b1); - b2 = aes_nohw_uncompact_word(b2); - b3 = aes_nohw_uncompact_word(b3); - OPENSSL_memcpy(out, &b0, 4); - OPENSSL_memcpy(out + 4, &b1, 4); - OPENSSL_memcpy(out + 8, &b2, 4); - OPENSSL_memcpy(out + 12, &b3, 4); -#endif -} - -// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in -// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and -// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it -// is repeated to the full width of |aes_word_t|. 
-static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, - uint32_t mask, aes_word_t shift) { -#if defined(OPENSSL_64_BIT) - aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; -#else - aes_word_t mask_w = mask; -#endif - // This is a variation on a delta swap. - aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; - *a ^= swap << shift; - *b ^= swap; -} - -// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides -// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares -// and transposes each square. -static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { - // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). - aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); - -#if AES_NOHW_BATCH_SIZE >= 4 - // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). - aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); -#endif - -#if AES_NOHW_BATCH_SIZE >= 8 - // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). - aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); -#endif -} - -// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, - size_t num_blocks) { - // Don't leave unused blocks uninitialized. - OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH)); - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in + 16 * i); - aes_nohw_batch_set(out, block, i); - } - - aes_nohw_transpose(out); -} - -// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, - const AES_NOHW_BATCH *batch) { - AES_NOHW_BATCH copy = *batch; - aes_nohw_transpose(©); - - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_batch_get(©, block, i); - aes_nohw_uncompact_block(out + 16 * i, block); - } -} - - -// AES round steps. - -static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, - const AES_NOHW_BATCH *key) { - for (size_t i = 0; i < 8; i++) { - batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); - } -} - -static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. 
- aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. - aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = 
aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. - aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. -#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. -static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
- aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} - -static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[0]); - for (size_t i = 1; i < num_rounds; i++) { - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_mix_columns(batch); - aes_nohw_add_round_key(batch, &key->keys[i]); - } - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); -} - -// Key schedule. - -static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, - const AES_KEY *key) { - for (size_t i = 0; i <= key->rounds; i++) { - // Copy the round key into each block in the batch. - for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { - aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); - aes_nohw_batch_set(&out->keys[i], tmp, j); - } - aes_nohw_transpose(&out->keys[i]); - } -} - -static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, - 0x20, 0x40, 0x80, 0x1b, 0x36}; - -// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in -// |rcon|, stored in a |aes_word_t|. 
-static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { - rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); - return ((aes_word_t)rcon); -} - -static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { - AES_NOHW_BATCH batch; - OPENSSL_memset(&batch, 0, sizeof(batch)); - aes_nohw_batch_set(&batch, in, 0); - aes_nohw_transpose(&batch); - aes_nohw_sub_bytes(&batch); - aes_nohw_transpose(&batch); - aes_nohw_batch_get(&batch, out, 0); -} - -static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { - key->rounds = 10; - - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in); - OPENSSL_memcpy(key->rd_key, block, 16); - - for (size_t i = 1; i <= 10; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block); - uint8_t rcon = aes_nohw_rcon[i - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); - block[j] = aes_nohw_xor( - block[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. Note this is reordered from the usual - // formulation to avoid needing masks. - aes_word_t v = block[j]; - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { - key->rounds = 14; - - // Each key schedule iteration produces two round keys. - aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block1, in); - OPENSSL_memcpy(key->rd_key, block1, 16); - - aes_nohw_compact_block(block2, in + 16); - OPENSSL_memcpy(key->rd_key + 4, block2, 16); - - for (size_t i = 2; i <= 14; i += 2) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); - block1[j] = aes_nohw_xor( - block1[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); - - if (i == 14) { - break; - } - - aes_nohw_sub_block(sub, block1); - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate the transformed word into the first word. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); - } -} - - -// External API. 
- -int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - switch (bits) { - case 128: - aes_nohw_setup_key_128(aeskey, key); - return 0; - case 256: - aes_nohw_setup_key_256(aeskey, key); - return 0; - } - return 1; -} - -void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], - const uint8_t b[16]) { - for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { - aes_word_t x, y; - OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); - OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); - x = aes_nohw_xor(x, y); - OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); - } -} - -void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t blocks, const AES_KEY *key, - const uint8_t ivec[16]) { - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - - // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. - alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; - alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - OPENSSL_memcpy(ivs + 16 * i, ivec, 16); - } - - uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); - for (;;) { - // Update counters. - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); - } - - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, ivs, todo); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(enc_ivs, todo, &batch); - - for (size_t i = 0; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); - } - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - ctr += AES_NOHW_BATCH_SIZE; - } -} diff --git a/crypto/internal.h b/crypto/internal.h index d56735eab6..63b0b8d68b 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -378,18 +378,6 @@ static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { static inline uint32_t CRYPTO_bswap4(uint32_t x) { return __builtin_bswap32(x); } - -static inline uint64_t CRYPTO_bswap8(uint64_t x) { - return __builtin_bswap64(x); -} -#elif defined(_MSC_VER) -#pragma warning(push, 3) -#include -#pragma warning(pop) -#pragma intrinsic(_byteswap_ulong) -static inline uint32_t CRYPTO_bswap4(uint32_t x) { - return _byteswap_ulong(x); -} #endif #if !defined(RING_CORE_NOSTDLIBINC) @@ -457,23 +445,6 @@ static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline uint32_t CRYPTO_load_u32_be(const void *in) { - uint32_t v; - OPENSSL_memcpy(&v, in, sizeof(v)); -#if !defined(RING_BIG_ENDIAN) - return CRYPTO_bswap4(v); -#else - return v; -#endif -} - -static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { -#if !defined(RING_BIG_ENDIAN) - v = CRYPTO_bswap4(v); -#endif - OPENSSL_memcpy(out, &v, sizeof(v)); -} - // Runtime CPU feature support #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) diff --git a/include/ring-core/aes.h b/include/ring-core/aes.h deleted file mode 100644 index 5b5130dad7..0000000000 --- 
a/include/ring-core/aes.h +++ /dev/null @@ -1,68 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_AES_H -#define OPENSSL_HEADER_AES_H - -#include - -// Raw AES functions. - - -// AES_MAXNR is the maximum number of AES rounds. -#define AES_MAXNR 14 - -// aes_key_st should be an opaque type, but EVP requires that the size be -// known. -struct aes_key_st { - uint32_t rd_key[4 * (AES_MAXNR + 1)]; - unsigned rounds; -}; -typedef struct aes_key_st AES_KEY; - -#endif // OPENSSL_HEADER_AES_H diff --git a/src/aead/aes.rs b/src/aead/aes.rs index 15802da29d..e3b09fd250 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -12,6 +12,8 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+mod aes_nohw; + use super::{nonce::Nonce, quic::Sample}; use crate::{ bits::BitLength, @@ -177,7 +179,7 @@ impl Key { cpu_features: cpu::Features, ) -> Result { let mut key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [[0u32; 4]; MAX_ROUNDS + 1], rounds: 0, }; @@ -203,9 +205,7 @@ impl Key { // SAFETY: `aes_nohw_set_encrypt_key` satisfies the `set_encrypt_key!` // contract. - Implementation::NOHW => unsafe { - set_encrypt_key!(aes_nohw_set_encrypt_key, bytes, &mut key, cpu_features)?; - }, + Implementation::NOHW => aes_nohw::set_encrypt_key(&mut key, bytes), }; Ok(Self { inner: key }) @@ -225,7 +225,11 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(vpaes_encrypt, a, self), - Implementation::NOHW => encrypt_block!(aes_nohw_encrypt, a, self), + Implementation::NOHW => { + let mut in_out = a; + aes_nohw::encrypt_block(&self.inner, &mut in_out); + in_out + } } } @@ -327,16 +331,7 @@ impl Key { // above, as required by `aes_nohw_ctr32_encrypt_blocks`. // * `aes_nohw_ctr32_encrypt_blocks` satisfies the contract for // `ctr32_encrypt_blocks`. - Implementation::NOHW => unsafe { - ctr32_encrypt_blocks!( - aes_nohw_ctr32_encrypt_blocks, - in_out, - src, - &self.inner, - ctr, - cpu_features - ) - }, + Implementation::NOHW => aes_nohw::ctr32_encrypt_within(&self.inner, in_out, src, ctr), } } @@ -358,15 +353,13 @@ impl Key { } } -// Keep this in sync with AES_KEY in aes.h. #[repr(C)] #[derive(Clone)] pub(super) struct AES_KEY { - pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rd_key: [[u32; 4]; MAX_ROUNDS + 1], pub rounds: c::uint, } -// Keep this in sync with `AES_MAXNR` in aes.h. const MAX_ROUNDS: usize = 14; pub const AES_128_KEY_LEN: usize = 128 / 8; @@ -399,6 +392,10 @@ impl Counter { let new_value = old_value + increment_by; [*c0, *c1, *c2, *c3] = u32::to_be_bytes(new_value); } + + pub(super) fn as_bytes_less_safe(&self) -> [u8; 16] { + self.0 + } } /// The IV for a single block encryption. @@ -510,7 +507,7 @@ unsafe fn bsaes_ctr32_encrypt_blocks_with_vpaes_key( } let mut bsaes_key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [[0u32; 4]; MAX_ROUNDS + 1], rounds: 0, }; // SAFETY: diff --git a/src/aead/aes/aes_nohw.rs b/src/aead/aes/aes_nohw.rs new file mode 100644 index 0000000000..77cc97ddbf --- /dev/null +++ b/src/aead/aes/aes_nohw.rs @@ -0,0 +1,786 @@ +// Copyright (c) 2019, Google Inc. +// Portions Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{Counter, KeyBytes, AES_KEY, BLOCK_LEN, MAX_ROUNDS}; +use crate::{ + constant_time, + polyfill::{self, usize_from_u32, ArraySplitMap as _}, +}; +use cfg_if::cfg_if; +use core::{array, ops::RangeFrom}; + +type Word = constant_time::Word; +const WORD_SIZE: usize = core::mem::size_of::(); +const BATCH_SIZE: usize = WORD_SIZE / 2; +#[allow(clippy::cast_possible_truncation)] +const BATCH_SIZE_U32: u32 = BATCH_SIZE as u32; + +const BLOCK_WORDS: usize = 16 / WORD_SIZE; + +cfg_if! { + if #[cfg(target_pointer_width = "64")] { + const ROW0_MASK: Word = 0x000f000f000f000f; + const ROW1_MASK: Word = 0x00f000f000f000f0; + const ROW2_MASK: Word = 0x0f000f000f000f00; + const ROW3_MASK: Word = 0xf000f000f000f000; + } else if #[cfg(target_pointer_width = "32")] { + const ROW0_MASK: Word = 0x03030303; + const ROW1_MASK: Word = 0x0c0c0c0c; + const ROW2_MASK: Word = 0x30303030; + const ROW3_MASK: Word = 0xc0c0c0c0; + } +} + +#[inline(always)] +fn and(a: Word, b: Word) -> Word { + a & b +} + +#[inline(always)] +fn or(a: Word, b: Word) -> Word { + a | b +} + +#[inline(always)] +fn xor(a: Word, b: Word) -> Word { + a ^ b +} + +#[inline(always)] +fn not(a: Word) -> Word { + !a +} + +#[inline(always)] +fn shift_left(a: Word) -> Word { + a << (I * BATCH_SIZE_U32) +} + +#[inline(always)] +fn shift_right(a: Word) -> Word { + a >> (I * BATCH_SIZE_U32) +} + +// aes_nohw_delta_swap returns |a| with bits |a & mask| and +// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. +#[inline(always)] +fn delta_swap(a: Word) -> Word { + // See + // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ + let b = (a ^ (a >> SHIFT)) & MASK; + a ^ b ^ (b << SHIFT) +} + +// In the 32-bit and 64-bit implementations, a block spans multiple words. +// |aes_nohw_compact_block| must permute bits across different words. First we +// implement |aes_nohw_compact_word| which performs a smaller version of the +// transformation which stays within a single word. +// +// These transformations are generalizations of the output of +// http://programming.sirrida.de/calcperm.php on smaller inputs. +#[inline(always)] +fn compact_word(a: Word) -> Word { + let a = Word::from_le(a); + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap + // quartets of those chunks: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 + let a = delta_swap::<0x00f000f000f000f0, 4>(a); + // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 + let a = delta_swap::<0x0000ff000000ff00, 8>(a); + // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => + // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 + delta_swap::<0x00000000ffff0000, 16>(a) + } else if #[cfg(target_pointer_width = "32")] { + // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 + // Note: 0x00cc = 0b0000_0000_1100_1100 + // 0x00cc << 6 = 0b0011_0011_0000_0000 + let a = delta_swap::<0x00cc00cc, 6>(a); + // Now we swap groups of four bits (still numbering by pairs): + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => + // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 + // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 + delta_swap::<0x0000f0f0, 12>(a) + } else { + unimplemented!() + } + } +} + +#[inline(always)] +fn uncompact_word(a: Word) -> Word { + #[cfg(target_pointer_width = "64")] + let r = { + // Reverse the steps of |aes_nohw_uncompact_word|. + let a = delta_swap::<0x00000000ffff0000, 16>(a); + let a = delta_swap::<0x0000ff000000ff00, 8>(a); + delta_swap::<0x00f000f000f000f0, 4>(a) + }; + + #[cfg(target_pointer_width = "32")] + let r = { + let a = delta_swap::<0x0000f0f0, 12>(a); + delta_swap::<0x00cc00cc, 6>(a) + }; + + Word::to_le(r) +} + +fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] { + let out: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(*input) }; + let a0 = compact_word(out[0]); + let a1 = compact_word(out[1]); + + #[cfg(target_pointer_width = "64")] + let r = [ + (a0 & 0x00000000ffffffff) | (a1 << 32), + (a1 & 0xffffffff00000000) | (a0 >> 32), + ]; + + #[cfg(target_pointer_width = "32")] + let r = { + let a2 = compact_word(out[2]); + let a3 = compact_word(out[3]); + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + [ + Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]), + Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]), + Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]), + Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]), + ] + }; + + r +} + +fn uncompact_block(out: &mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]) { + let a0 = input[0]; + let a1 = input[1]; + + #[cfg(target_pointer_width = "64")] + let [b0, b1] = { + [ + (a0 & 0x00000000ffffffff) | (a1 << 32), + (a1 & 0xffffffff00000000) | (a0 >> 32), + ] + }; + + #[cfg(target_pointer_width = "32")] + let [b0, b1, b2, b3] = { + let a2 = input[2]; + let a3 = input[3]; + + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. 
This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + let b0 = Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]); + let b1 = Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]); + let b2 = Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]); + let b3 = Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]); + [b0, b1, b2, b3] + }; + + let b0 = uncompact_word(b0); + let b1 = uncompact_word(b1); + + #[cfg(target_pointer_width = "32")] + let (b2, b3) = (uncompact_word(b2), uncompact_word(b3)); + + let (out, _) = polyfill::slice::as_chunks_mut(out); + out[0] = Word::to_ne_bytes(b0); + out[1] = Word::to_ne_bytes(b1); + + #[cfg(target_pointer_width = "32")] + { + out[2] = Word::to_ne_bytes(b2); + out[3] = Word::to_ne_bytes(b3); + } +} + +#[cfg(target_pointer_width = "32")] +#[inline(always)] +fn lo(w: Word) -> u8 { + w as u8 +} + +// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in +// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and +// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it +// is repeated to the full width of |aes_word_t|. +fn swap_bits( + w: &mut [Word; 8], +) { + // TODO: const MASK: Word = ... + let mask = Word::from_ne_bytes([MASK_BYTE; core::mem::size_of::()]); + + // This is a variation on a delta swap. + let swap = ((w[A] >> SHIFT) ^ w[B]) & mask; + w[A] ^= swap << SHIFT; + w[B] ^= swap; +} + +// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise +// specified, it is in bitsliced form. +#[repr(C)] +struct Batch { + w: [Word; 8], +} + +impl Batch { + // aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. + // |num_blocks| must be at most |AES_NOHW_BATCH|. + fn from_bytes(input: &[[u8; BLOCK_LEN]]) -> Self { + let mut r = Self { + w: Default::default(), + }; + input.iter().enumerate().for_each(|(i, input)| { + let block = compact_block(input); + r.set(&block, i); + }); + r.transpose(); + r + } + + // aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in + // compact form. + fn set(&mut self, input: &[Word; BLOCK_WORDS], i: usize) { + assert!(i < self.w.len()); + + // Note the words are interleaved. The order comes from |aes_nohw_transpose|. + // If |i| is zero and this is the 64-bit implementation, in[0] contains bits + // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at + // w[4] so that bits 0 and 4 are in the correct position. (In general, bits + // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares + // will be correctly placed.) + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + self.w[i] = input[0]; + self.w[i + 4] = input[1]; + } else if #[cfg(target_pointer_width = "32")] { + self.w[i] = input[0]; + self.w[i + 2] = input[1]; + self.w[i + 4] = input[2]; + self.w[i + 6] = input[3]; + } else { + todo!() + } + } + } + + // aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in + // compact form. + fn get(&self, i: usize) -> [Word; BLOCK_WORDS] { + assert!(i < self.w.len()); + array::from_fn(|j| { + #[cfg(target_pointer_width = "64")] + const STRIDE: usize = 4; + #[cfg(target_pointer_width = "32")] + const STRIDE: usize = 2; + + self.w[i + (j * STRIDE)] + }) + } +} + +// AES round steps. +impl Batch { + fn sub_bytes(&mut self) { + // See https://eprint.iacr.org/2009/191.pdf, Appendix C. 
+ let x0 = self.w[7]; + let x1 = self.w[6]; + let x2 = self.w[5]; + let x3 = self.w[4]; + let x4 = self.w[3]; + let x5 = self.w[2]; + let x6 = self.w[1]; + let x7 = self.w[0]; + + // Figure 2, the top linear transformation. + let y14 = xor(x3, x5); + let y13 = xor(x0, x6); + let y9 = xor(x0, x3); + let y8 = xor(x0, x5); + let t0 = xor(x1, x2); + let y1 = xor(t0, x7); + let y4 = xor(y1, x3); + let y12 = xor(y13, y14); + let y2 = xor(y1, x0); + let y5 = xor(y1, x6); + let y3 = xor(y5, y8); + let t1 = xor(x4, y12); + let y15 = xor(t1, x5); + let y20 = xor(t1, x1); + let y6 = xor(y15, x7); + let y10 = xor(y15, t0); + let y11 = xor(y20, y9); + let y7 = xor(x7, y11); + let y17 = xor(y10, y11); + let y19 = xor(y10, y8); + let y16 = xor(t0, y11); + let y21 = xor(y13, y16); + let y18 = xor(x0, y16); + + // Figure 3, the middle non-linear section. + let t2 = and(y12, y15); + let t3 = and(y3, y6); + let t4 = xor(t3, t2); + let t5 = and(y4, x7); + let t6 = xor(t5, t2); + let t7 = and(y13, y16); + let t8 = and(y5, y1); + let t9 = xor(t8, t7); + let t10 = and(y2, y7); + let t11 = xor(t10, t7); + let t12 = and(y9, y11); + let t13 = and(y14, y17); + let t14 = xor(t13, t12); + let t15 = and(y8, y10); + let t16 = xor(t15, t12); + let t17 = xor(t4, t14); + let t18 = xor(t6, t16); + let t19 = xor(t9, t14); + let t20 = xor(t11, t16); + let t21 = xor(t17, y20); + let t22 = xor(t18, y19); + let t23 = xor(t19, y21); + let t24 = xor(t20, y18); + let t25 = xor(t21, t22); + let t26 = and(t21, t23); + let t27 = xor(t24, t26); + let t28 = and(t25, t27); + let t29 = xor(t28, t22); + let t30 = xor(t23, t24); + let t31 = xor(t22, t26); + let t32 = and(t31, t30); + let t33 = xor(t32, t24); + let t34 = xor(t23, t33); + let t35 = xor(t27, t33); + let t36 = and(t24, t35); + let t37 = xor(t36, t34); + let t38 = xor(t27, t36); + let t39 = and(t29, t38); + let t40 = xor(t25, t39); + let t41 = xor(t40, t37); + let t42 = xor(t29, t33); + let t43 = xor(t29, t40); + let t44 = xor(t33, t37); + let t45 = xor(t42, t41); + let z0 = and(t44, y15); + let z1 = and(t37, y6); + let z2 = and(t33, x7); + let z3 = and(t43, y16); + let z4 = and(t40, y1); + let z5 = and(t29, y7); + let z6 = and(t42, y11); + let z7 = and(t45, y17); + let z8 = and(t41, y10); + let z9 = and(t44, y12); + let z10 = and(t37, y3); + let z11 = and(t33, y4); + let z12 = and(t43, y13); + let z13 = and(t40, y5); + let z14 = and(t29, y2); + let z15 = and(t42, y9); + let z16 = and(t45, y14); + let z17 = and(t41, y8); + + // Figure 4, bottom linear transformation. 
+ let t46 = xor(z15, z16); + let t47 = xor(z10, z11); + let t48 = xor(z5, z13); + let t49 = xor(z9, z10); + let t50 = xor(z2, z12); + let t51 = xor(z2, z5); + let t52 = xor(z7, z8); + let t53 = xor(z0, z3); + let t54 = xor(z6, z7); + let t55 = xor(z16, z17); + let t56 = xor(z12, t48); + let t57 = xor(t50, t53); + let t58 = xor(z4, t46); + let t59 = xor(z3, t54); + let t60 = xor(t46, t57); + let t61 = xor(z14, t57); + let t62 = xor(t52, t58); + let t63 = xor(t49, t58); + let t64 = xor(z4, t59); + let t65 = xor(t61, t62); + let t66 = xor(z1, t63); + let s0 = xor(t59, t63); + let s6 = xor(t56, not(t62)); + let s7 = xor(t48, not(t60)); + let t67 = xor(t64, t65); + let s3 = xor(t53, t66); + let s4 = xor(t51, t66); + let s5 = xor(t47, t65); + let s1 = xor(t64, not(s3)); + let s2 = xor(t55, not(t67)); + + self.w[0] = s7; + self.w[1] = s6; + self.w[2] = s5; + self.w[3] = s4; + self.w[4] = s3; + self.w[5] = s2; + self.w[6] = s1; + self.w[7] = s0; + } + + fn add_round_key(&mut self, key: &Batch) { + constant_time::xor_assign_at_start(&mut self.w, &key.w) + } + + #[inline(always)] + fn rotate_cols_right( + v: Word, + ) -> Word { + or( + shift_right::(v), + shift_left::(v), + ) + } +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. +// TODO(MSRV feature(generic_const_exprs)): Replace this. +macro_rules! rotate_cols_right { + ( Self::rotate_cols_right::<$N:literal>($v:expr) ) => { + Self::rotate_cols_right::<{ $N * 4 }, { 16 - ($N * 4) }>($v) + }; +} + +impl Batch { + fn shift_rows(&mut self) { + self.w.iter_mut().for_each(|w| { + let row0 = and(*w, ROW0_MASK); + let row1 = and(*w, ROW1_MASK); + let row2 = and(*w, ROW2_MASK); + let row3 = and(*w, ROW3_MASK); + let row1 = rotate_cols_right!(Self::rotate_cols_right::<1>(row1)); + let row2 = rotate_cols_right!(Self::rotate_cols_right::<2>(row2)); + let row3 = rotate_cols_right!(Self::rotate_cols_right::<3>(row3)); + *w = or(or(row0, row1), or(row2, row3)); + }); + } + + fn mix_columns(&mut self) { + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. + let a0 = self.w[0]; + let a1 = self.w[1]; + let a2 = self.w[2]; + let a3 = self.w[3]; + let a4 = self.w[4]; + let a5 = self.w[5]; + let a6 = self.w[6]; + let a7 = self.w[7]; + + let r0 = rotate_rows_down(a0); + let a0_r0 = xor(a0, r0); + let r1 = rotate_rows_down(a1); + let a1_r1 = xor(a1, r1); + let r2 = rotate_rows_down(a2); + let a2_r2 = xor(a2, r2); + let r3 = rotate_rows_down(a3); + let a3_r3 = xor(a3, r3); + let r4 = rotate_rows_down(a4); + let a4_r4 = xor(a4, r4); + let r5 = rotate_rows_down(a5); + let a5_r5 = xor(a5, r5); + let r6 = rotate_rows_down(a6); + let a6_r6 = xor(a6, r6); + let r7 = rotate_rows_down(a7); + let a7_r7 = xor(a7, r7); + + self.w[0] = xor(xor(a7_r7, r0), rotate_rows_twice(a0_r0)); + self.w[1] = xor(xor(a0_r0, a7_r7), xor(r1, rotate_rows_twice(a1_r1))); + self.w[2] = xor(xor(a1_r1, r2), rotate_rows_twice(a2_r2)); + self.w[3] = xor(xor(a2_r2, a7_r7), xor(r3, rotate_rows_twice(a3_r3))); + self.w[4] = xor(xor(a3_r3, a7_r7), xor(r4, rotate_rows_twice(a4_r4))); + self.w[5] = xor(xor(a4_r4, r5), rotate_rows_twice(a5_r5)); + self.w[6] = xor(xor(a5_r5, r6), rotate_rows_twice(a6_r6)); + self.w[7] = xor(xor(a6_r6, r7), rotate_rows_twice(a7_r7)); + } + + // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|. + // |num_blocks| must be at most |AES_NOHW_BATCH|. 
+    // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
+    // |num_blocks| must be at most |AES_NOHW_BATCH|.
+    pub fn into_bytes(self, out: &mut [[u8; BLOCK_LEN]]) {
+        assert!(out.len() <= BATCH_SIZE);
+
+        // TODO: Why did the original code copy `self`?
+        let mut copy = self;
+        copy.transpose();
+        out.iter_mut().enumerate().for_each(|(i, out)| {
+            let block = copy.get(i);
+            uncompact_block(out, &block);
+        });
+    }
+
+    fn encrypt(mut self, key: &Schedule, rounds: usize, out: &mut [[u8; BLOCK_LEN]]) {
+        assert!(out.len() <= BATCH_SIZE);
+        self.add_round_key(&key.keys[0]);
+        key.keys[1..rounds].iter().for_each(|key| {
+            self.sub_bytes();
+            self.shift_rows();
+            self.mix_columns();
+            self.add_round_key(key);
+        });
+        self.sub_bytes();
+        self.shift_rows();
+        self.add_round_key(&key.keys[rounds]);
+        self.into_bytes(out);
+    }
+
+    // aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
+    // the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
+    // and transposes each square.
+    fn transpose(&mut self) {
+        const _: () = assert!(BATCH_SIZE == 2 || BATCH_SIZE == 4);
+
+        // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
+        swap_bits::<0, 1, 0x55, 1>(&mut self.w);
+        swap_bits::<2, 3, 0x55, 1>(&mut self.w);
+        swap_bits::<4, 5, 0x55, 1>(&mut self.w);
+        swap_bits::<6, 7, 0x55, 1>(&mut self.w);
+
+        if BATCH_SIZE >= 4 {
+            // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
+            swap_bits::<0, 2, 0x33, 2>(&mut self.w);
+            swap_bits::<1, 3, 0x33, 2>(&mut self.w);
+            swap_bits::<4, 6, 0x33, 2>(&mut self.w);
+            swap_bits::<5, 7, 0x33, 2>(&mut self.w);
+        }
+    }
+}
+
+#[inline(always)]
+fn rotate_rows_down(v: Word) -> Word {
+    #[cfg(target_pointer_width = "64")]
+    {
+        ((v >> 4) & 0x0fff0fff0fff0fff) | ((v << 12) & 0xf000f000f000f000)
+    }
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0)
+    }
+}
+
+// rotate_rows_twice returns |v| with the rows in each column rotated
+// by two.
+#[inline(always)]
+fn rotate_rows_twice(v: Word) -> Word {
+    #[cfg(target_pointer_width = "64")]
+    {
+        ((v >> 8) & 0x00ff00ff00ff00ff) | ((v << 8) & 0xff00ff00ff00ff00)
+    }
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0)
+    }
+}
+
+// Key schedule.
+
+// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
+// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH|
+// |AES_KEY|s so it should not be used as a long-term key representation.
+struct Schedule {
+    // keys is an array of batches, one for each round key. Each batch stores
+    // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
+    keys: [Batch; MAX_ROUNDS + 1],
+}
+
+impl Schedule {
+    fn expand_round_keys(key: &AES_KEY) -> Self {
+        Self {
+            keys: array::from_fn(|i| {
+                let tmp: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(key.rd_key[i]) };
+
+                let mut r = Batch { w: [0; 8] };
+                // Copy the round key into each block in the batch.
+                for j in 0..BATCH_SIZE {
+                    r.set(&tmp, j);
+                }
+                r.transpose();
+                r
+            }),
+        }
+    }
+}
+
+static RCON: [u8; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36];
+
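RCON is the usual AES round-constant table: successive powers of x in GF(2^8). A minimal standalone sketch that reproduces it (the `xtime`-style update below is illustrative, not part of this change):

    fn main() {
        let mut expected = [0u8; 10];
        let mut x: u8 = 1;
        for rcon in expected.iter_mut() {
            *rcon = x;
            // Multiply by x (i.e. by 2) in GF(2^8), reducing by the AES polynomial.
            x = (x << 1) ^ (if x & 0x80 != 0 { 0x1b } else { 0 });
        }
        assert_eq!(
            expected,
            [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]
        );
    }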
+// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
+// |rcon|, stored in a |aes_word_t|.
+#[inline(always)]
+fn rcon_slice(rcon: u8, i: usize) -> Word {
+    let rcon = (rcon >> (i * BATCH_SIZE)) & ((1 << BATCH_SIZE) - 1);
+    rcon.into()
+}
+
+pub(super) fn set_encrypt_key(key: &mut AES_KEY, bytes: KeyBytes) {
+    match bytes {
+        KeyBytes::AES_128(bytes) => setup_key_128(key, bytes),
+        KeyBytes::AES_256(bytes) => setup_key_256(key, bytes),
+    }
+}
+
+fn setup_key_128(key: &mut AES_KEY, input: &[u8; 128 / 8]) {
+    key.rounds = 10;
+
+    let mut block = compact_block(input);
+    key.rd_key[0] = unsafe { core::mem::transmute(block) };
+
+    key.rd_key[1..=10]
+        .iter_mut()
+        .zip(RCON)
+        .for_each(|(rd_key, rcon)| {
+            let sub = sub_block(&block);
+            *rd_key = derive_round_key(&mut block, sub, rcon);
+        });
+}
+
+pub(super) fn encrypt_block(key: &AES_KEY, in_out: &mut [u8; BLOCK_LEN]) {
+    let sched = Schedule::expand_round_keys(key);
+    let batch = Batch::from_bytes(core::slice::from_ref(in_out));
+    batch.encrypt(&sched, usize_from_u32(key.rounds), array::from_mut(in_out));
+}
+
+fn setup_key_256(key: &mut AES_KEY, input: &[u8; 32]) {
+    key.rounds = 14;
+
+    // Each key schedule iteration produces two round keys.
+    let (input, _) = polyfill::slice::as_chunks(input);
+    let mut block1 = compact_block(&input[0]);
+    key.rd_key[0] = unsafe { core::mem::transmute(block1) };
+    let mut block2 = compact_block(&input[1]);
+    key.rd_key[1] = unsafe { core::mem::transmute(block2) };
+
+    key.rd_key[2..=14]
+        .chunks_mut(2)
+        .zip(RCON)
+        .for_each(|(rd_key_pair, rcon)| {
+            let sub = sub_block(&block2);
+            rd_key_pair[0] = derive_round_key(&mut block1, sub, rcon);
+
+            if let Some(rd_key_2) = rd_key_pair.get_mut(1) {
+                let sub = sub_block(&block1);
+                block2.iter_mut().zip(sub).for_each(|(w, sub)| {
+                    // Incorporate the transformed word into the first word.
+                    *w ^= shift_right::<12>(sub);
+                    // Propagate to the remaining words.
+                    let v = *w;
+                    *w ^= shift_left::<4>(v);
+                    *w ^= shift_left::<8>(v);
+                    *w ^= shift_left::<12>(v);
+                });
+                *rd_key_2 = unsafe { core::mem::transmute(block2) };
+            }
+        });
+}
+
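For intuition about `rcon_slice`: with 64-bit words BATCH_SIZE is 4, so an 8-bit round constant is consumed as two 4-bit groups, one per compacted block word. A standalone sketch with that constant inlined (the names here are illustrative):

    fn main() {
        const BATCH_SIZE: usize = 4; // 64-bit words
        let rcon: u8 = 0x1b;
        let groups: [u8; 2] =
            core::array::from_fn(|i| (rcon >> (i * BATCH_SIZE)) & ((1 << BATCH_SIZE) - 1));
        // Group 0 holds the low nibble, group 1 the high nibble.
        assert_eq!(groups, [0x0b, 0x01]);
    }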
+fn derive_round_key(
+    block: &mut [Word; BLOCK_WORDS],
+    sub: [Word; BLOCK_WORDS],
+    rcon: u8,
+) -> [u32; 4] {
+    block
+        .iter_mut()
+        .zip(sub)
+        .enumerate()
+        .for_each(|(j, (w, sub))| {
+            // Incorporate |rcon| and the transformed word into the first word.
+            *w ^= rcon_slice(rcon, j);
+            *w ^= shift_right::<12>(rotate_rows_down(sub));
+            // Propagate to the remaining words.
+            let v = *w;
+            *w ^= shift_left::<4>(v);
+            *w ^= shift_left::<8>(v);
+            *w ^= shift_left::<12>(v);
+        });
+    unsafe { core::mem::transmute(*block) }
+}
+
+fn sub_block(input: &[Word; BLOCK_WORDS]) -> [Word; BLOCK_WORDS] {
+    let mut batch = Batch {
+        w: Default::default(),
+    };
+    batch.set(input, 0);
+    batch.transpose();
+    batch.sub_bytes();
+    batch.transpose();
+    batch.get(0)
+}
+
+pub(super) fn ctr32_encrypt_within(
+    key: &AES_KEY,
+    mut in_out: &mut [u8],
+    src: RangeFrom<usize>,
+    ctr: &mut Counter,
+) {
+    let (input, leftover): (&[[u8; BLOCK_LEN]], _) =
+        polyfill::slice::as_chunks(&in_out[src.clone()]);
+    debug_assert_eq!(leftover.len(), 0);
+    if input.is_empty() {
+        return;
+    }
+    let blocks_u32 = u32::try_from(input.len()).unwrap();
+
+    let sched = Schedule::expand_round_keys(key);
+
+    let initial_ctr = ctr.as_bytes_less_safe();
+    ctr.increment_by_less_safe(blocks_u32);
+
+    let mut ivs = [initial_ctr; BATCH_SIZE];
+    let mut enc_ctrs = [[0u8; 16]; BATCH_SIZE];
+    let initial_ctr: [[u8; 4]; 4] = initial_ctr.array_split_map(|x| x);
+    let mut ctr = u32::from_be_bytes(initial_ctr[3]);
+
+    for _ in (0..).step_by(BATCH_SIZE) {
+        (0u32..).zip(ivs.iter_mut()).for_each(|(i, iv)| {
+            iv[12..].copy_from_slice(&u32::to_be_bytes(ctr + i));
+        });
+
+        let (input, leftover): (&[[u8; BLOCK_LEN]], _) =
+            polyfill::slice::as_chunks(&in_out[src.clone()]);
+        debug_assert_eq!(leftover.len(), 0);
+        let todo = core::cmp::min(ivs.len(), input.len());
+        let batch = Batch::from_bytes(&ivs[..todo]);
+        batch.encrypt(&sched, usize_from_u32(key.rounds), &mut enc_ctrs[..todo]);
+        constant_time::xor_within_chunked_at_start(in_out, src.clone(), &enc_ctrs[..todo]);
+
+        if todo < BATCH_SIZE {
+            break;
+        }
+        in_out = &mut in_out[(BLOCK_LEN * BATCH_SIZE)..];
+        ctr += BATCH_SIZE_U32;
+    }
+}
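In the CTR loop above, only the last four bytes of each counter block vary within a batch; they carry a big-endian block counter that increases by one per block. A minimal standalone sketch of that layout (the variable names are illustrative):

    fn main() {
        let mut iv = [0u8; 16];
        let initial: u32 = 1;
        let blocks: Vec<[u8; 16]> = (0..3u32)
            .map(|i| {
                // Bytes 0..12 (the nonce/IV prefix) stay fixed; bytes 12..16 hold the counter.
                iv[12..].copy_from_slice(&(initial + i).to_be_bytes());
                iv
            })
            .collect();
        assert_eq!(&blocks[0][12..], &[0u8, 0, 0, 1]);
        assert_eq!(&blocks[2][12..], &[0u8, 0, 0, 3]);
    }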
diff --git a/src/constant_time.rs b/src/constant_time.rs
index dd18463b5e..65918e2c4a 100644
--- a/src/constant_time.rs
+++ b/src/constant_time.rs
@@ -14,7 +14,14 @@
 //! Constant-time operations.
 
-use crate::{c, error};
+use crate::{c, error, polyfill};
+use core::{cmp, ops::RangeFrom};
+
+#[cfg(target_pointer_width = "64")]
+pub(crate) type Word = u64;
+
+#[cfg(target_pointer_width = "32")]
+pub(crate) type Word = u32;
 
 /// Returns `Ok(())` if `a == b` and `Err(error::Unspecified)` otherwise.
 /// The comparison of `a` and `b` is done in constant time with respect to the
@@ -45,13 +52,54 @@ pub(crate) fn xor<const N: usize>(mut a: [u8; N], b: [u8; N]) -> [u8; N] {
 
 /// XORs the first N bytes of `b` into `a`, where N is
 /// `core::cmp::min(a.len(), b.len())`.
 #[inline(always)]
-pub(crate) fn xor_assign_at_start<'a>(
+pub(crate) fn xor_assign_at_start_bytes<'a>(
     a: impl IntoIterator<Item = &'a mut u8>,
     b: impl IntoIterator<Item = &'a u8>,
 ) {
     a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b);
 }
 
+/// XORs the first N words of `b` into `a`, where N is
+/// `core::cmp::min(a.len(), b.len())`.
+#[inline(always)]
+pub(crate) fn xor_assign_at_start<'a>(
+    a: impl IntoIterator<Item = &'a mut Word>,
+    b: impl IntoIterator<Item = &'a Word>,
+) {
+    a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b);
+}
+
+#[inline(always)]
+pub(crate) fn xor_within_chunked_at_start<const INNER: usize>(
+    in_out: &mut [u8],
+    src: RangeFrom<usize>,
+    b: &[[u8; INNER]],
+) {
+    let (mut input, num_blocks) = {
+        let input = match in_out.get(src.clone()) {
+            Some(input) => input,
+            None => {
+                panic!()
+            }
+        };
+
+        let (input, _): (&[[u8; INNER]], _) = polyfill::slice::as_chunks(input);
+        let num_blocks = cmp::min(input.len(), b.len());
+        (input.as_ptr(), num_blocks)
+    };
+    let (output, _): (&mut [[u8; INNER]], _) = polyfill::slice::as_chunks_mut(in_out);
+    let output = &mut output[..num_blocks];
+
+    for (b, out) in (b[..num_blocks].iter()).zip(output) {
+        let a = unsafe { core::ptr::read(input) };
+        out.iter_mut()
+            .zip(a.iter().zip(b))
+            .for_each(|(out, (a, b))| {
+                *out = *a ^ *b;
+            });
+        input = unsafe { input.add(1) };
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/hmac.rs b/src/hmac.rs
index 34984d62aa..a710f5c65f 100644
--- a/src/hmac.rs
+++ b/src/hmac.rs
@@ -234,7 +234,7 @@ impl Key {
         // If the key is shorter than one block then we're supposed to act like
         // it is padded with zero bytes up to the block length. `x ^ 0 == x` so
         // we can just leave the trailing bytes of `padded_key` untouched.
-        constant_time::xor_assign_at_start(&mut padded_key[..], key_value);
+        constant_time::xor_assign_at_start_bytes(&mut padded_key[..], key_value);
 
         let leftover = key.inner.update(padded_key, cpu_features);
         debug_assert_eq!(leftover.len(), 0);
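The renamed byte-wise helper and the new word-wise `xor_assign_at_start` compute the same XOR; the word variant just lets the bitsliced AES code operate on whole `Word` lanes. A standalone equivalence sketch (not using the crate's helpers):

    fn main() {
        let a: [u64; 2] = [0x0123_4567_89ab_cdef, 0xfedc_ba98_7654_3210];
        let b: [u64; 2] = [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff];

        // Word at a time.
        let mut words = a;
        words.iter_mut().zip(b.iter()).for_each(|(w, b)| *w ^= *b);

        // Byte at a time on the little-endian encoding.
        let mut bytes: Vec<u8> = a.iter().flat_map(|w| w.to_le_bytes()).collect();
        let b_bytes: Vec<u8> = b.iter().flat_map(|w| w.to_le_bytes()).collect();
        bytes.iter_mut().zip(&b_bytes).for_each(|(x, y)| *x ^= *y);

        let word_bytes: Vec<u8> = words.iter().flat_map(|w| w.to_le_bytes()).collect();
        assert_eq!(bytes, word_bytes);
    }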
diff --git a/src/limb.rs b/src/limb.rs
index 1fd6c27b5a..b18f929bab 100644
--- a/src/limb.rs
+++ b/src/limb.rs
@@ -21,34 +21,20 @@
 use crate::{c, error, polyfill::ArrayFlatMap};
 
 #[cfg(any(test, feature = "alloc"))]
-use crate::bits;
+use crate::{bits, constant_time, polyfill::usize_from_u32};
 
 #[cfg(feature = "alloc")]
 use core::num::Wrapping;
 
 // XXX: Not correct for x32 ABIs.
-#[cfg(target_pointer_width = "64")]
-pub type Limb = u64;
-#[cfg(target_pointer_width = "32")]
-pub type Limb = u32;
-#[cfg(target_pointer_width = "64")]
-pub const LIMB_BITS: usize = 64;
-#[cfg(target_pointer_width = "32")]
-pub const LIMB_BITS: usize = 32;
-
-#[cfg(target_pointer_width = "64")]
-#[derive(Debug, PartialEq)]
-#[repr(u64)]
-pub enum LimbMask {
-    True = 0xffff_ffff_ffff_ffff,
-    False = 0,
-}
+pub type Limb = constant_time::Word;
+pub const LIMB_BITS: usize = usize_from_u32(Limb::BITS);
 
-#[cfg(target_pointer_width = "32")]
+#[cfg_attr(target_pointer_width = "64", repr(u64))]
+#[cfg_attr(target_pointer_width = "32", repr(u32))]
 #[derive(Debug, PartialEq)]
-#[repr(u32)]
 pub enum LimbMask {
-    True = 0xffff_ffff,
+    True = Limb::MAX,
     False = 0,
 }
diff --git a/src/pbkdf2.rs b/src/pbkdf2.rs
index 5a25f5d7f6..d5240fe182 100644
--- a/src/pbkdf2.rs
+++ b/src/pbkdf2.rs
@@ -189,7 +189,7 @@ fn derive_block(secret: &hmac::Key, iterations: NonZeroU32, salt: &[u8], idx: u3
     let mut remaining: u32 = iterations.into();
 
     loop {
-        constant_time::xor_assign_at_start(&mut out[..], u.as_ref());
+        constant_time::xor_assign_at_start_bytes(&mut out[..], u.as_ref());
 
         if remaining == 1 {
             break;
diff --git a/src/polyfill.rs b/src/polyfill.rs
index f09563c0d9..39296fc086 100644
--- a/src/polyfill.rs
+++ b/src/polyfill.rs
@@ -22,7 +22,7 @@ pub const fn u64_from_usize(x: usize) -> u64 {
 }
 
 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-pub fn usize_from_u32(x: u32) -> usize {
+pub const fn usize_from_u32(x: u32) -> usize {
     x as usize
 }
 
diff --git a/src/rsa/padding.rs b/src/rsa/padding.rs
index 2fe7dda575..d544d5e852 100644
--- a/src/rsa/padding.rs
+++ b/src/rsa/padding.rs
@@ -74,7 +74,7 @@ fn mgf1(digest_alg: &'static digest::Algorithm, seed: &[u8], out: &mut [u8]) {
 
         // The last chunk may legitimately be shorter than `digest`, but
         // `digest` will never be shorter than `out`.
-        constant_time::xor_assign_at_start(out, digest.as_ref());
+        constant_time::xor_assign_at_start_bytes(out, digest.as_ref());
     }
 }
 
diff --git a/src/rsa/padding/pss.rs b/src/rsa/padding/pss.rs
index 35fc82be7c..4c4d048894 100644
--- a/src/rsa/padding/pss.rs
+++ b/src/rsa/padding/pss.rs
@@ -159,7 +159,7 @@ impl Verification for PSS {
         // Step 8.
         let db_rest = &mut db[1..];
         let masked_bytes = masked_bytes.read_bytes(db_rest.len())?;
-        constant_time::xor_assign_at_start(db_rest, masked_bytes.as_slice_less_safe());
+        constant_time::xor_assign_at_start_bytes(db_rest, masked_bytes.as_slice_less_safe());
 
         Ok(())
     })?;
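The reworked `LimbMask` relies on `Limb::MAX` being the all-ones pattern for whichever pointer width is selected, and `LIMB_BITS` is now derived from the type rather than hard-coded. A trivial standalone check of those assumptions (the local `Limb` alias mirrors the one in limb.rs):

    #[cfg(target_pointer_width = "64")]
    type Limb = u64;
    #[cfg(target_pointer_width = "32")]
    type Limb = u32;

    fn main() {
        // All-ones is the value LimbMask::True now uses.
        assert_eq!(Limb::MAX, !(0 as Limb));
        // Limb::BITS matches the formerly hard-coded LIMB_BITS values.
        assert_eq!(Limb::BITS as usize, core::mem::size_of::<Limb>() * 8);
    }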