diff --git a/Cargo.toml b/Cargo.toml index 0898ddd1af..7160e0362c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,6 @@ include = [ "crypto/curve25519/curve25519_64_adx.c", "crypto/curve25519/curve25519_tables.h", "crypto/curve25519/internal.h", - "crypto/fipsmodule/aes/aes_nohw.c", "crypto/fipsmodule/aes/asm/aesni-x86.pl", "crypto/fipsmodule/aes/asm/aesni-x86_64.pl", "crypto/fipsmodule/aes/asm/aesv8-armx.pl", @@ -106,7 +105,6 @@ include = [ "crypto/cipher_extra/asm/chacha20_poly1305_armv8.pl", "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl", "examples/**/*.rs", - "include/ring-core/aes.h", "include/ring-core/arm_arch.h", "include/ring-core/asm_base.h", "include/ring-core/base.h", diff --git a/build.rs b/build.rs index 9d56a3ef05..2c473a3061 100644 --- a/build.rs +++ b/build.rs @@ -53,7 +53,6 @@ const WASM32: &str = "wasm32"; #[rustfmt::skip] const RING_SRCS: &[(&[&str], &str)] = &[ (&[], "crypto/curve25519/curve25519.c"), - (&[], "crypto/fipsmodule/aes/aes_nohw.c"), (&[], "crypto/fipsmodule/bn/montgomery.c"), (&[], "crypto/fipsmodule/bn/montgomery_inv.c"), (&[], "crypto/fipsmodule/ec/ecp_nistz.c"), @@ -869,9 +868,6 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String { "aes_hw_ctr32_encrypt_blocks", "aes_hw_encrypt", "aes_hw_set_encrypt_key", - "aes_nohw_ctr32_encrypt_blocks", - "aes_nohw_encrypt", - "aes_nohw_set_encrypt_key", "aesni_gcm_decrypt", "aesni_gcm_encrypt", "bn_from_montgomery_in_place", diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index 9530cbc9b2..0000000000 --- a/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,881 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit or 64-bit, operating on two-, four-, and eight-block -// batches, respectively. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". 
Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) -// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. - -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} - -OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. -// -// This implementation uses three representations for AES blocks. First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. 
For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... -// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is -// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| -// |AES_KEY|s so it should not be used as a long-term key representation. -typedef struct { - // keys is an array of batches, one for each round key. Each batch stores - // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. - AES_NOHW_BATCH keys[AES_MAXNR + 1]; -} AES_NOHW_SCHEDULE; - -// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in -// compact form. -static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, - const aes_word_t in[AES_NOHW_BLOCK_WORDS], - size_t i) { - // Note the words are interleaved. The order comes from |aes_nohw_transpose|. - // If |i| is zero and this is the 64-bit implementation, in[0] contains bits - // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at - // w[4] so that bits 0 and 4 are in the correct position. (In general, bits - // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares - // will be correctly placed.) - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_64_BIT) - batch->w[i] = in[0]; - batch->w[i + 4] = in[1]; -#else - batch->w[i] = in[0]; - batch->w[i + 2] = in[1]; - batch->w[i + 4] = in[2]; - batch->w[i + 6] = in[3]; -#endif -} - -// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in -// compact form. 
-static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, - aes_word_t out[AES_NOHW_BLOCK_WORDS], - size_t i) { - dev_assert_secret(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_64_BIT) - out[0] = batch->w[i]; - out[1] = batch->w[i + 4]; -#else - out[0] = batch->w[i]; - out[1] = batch->w[i + 2]; - out[2] = batch->w[i + 4]; - out[3] = batch->w[i + 6]; -#endif -} - -// aes_nohw_delta_swap returns |a| with bits |a & mask| and -// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. -static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, - aes_word_t shift) { - // See - // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ - aes_word_t b = (a ^ (a >> shift)) & mask; - return a ^ b ^ (b << shift); -} - -// In the 32-bit and 64-bit implementations, a block spans multiple words. -// |aes_nohw_compact_block| must permute bits across different words. First we -// implement |aes_nohw_compact_word| which performs a smaller version of the -// transformation which stays within a single word. -// -// These transformations are generalizations of the output of -// http://programming.sirrida.de/calcperm.php on smaller inputs. -#if defined(OPENSSL_64_BIT) -static inline uint64_t aes_nohw_compact_word(uint64_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap - // quartets of those chunks: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => - // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - return a; -} - -static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap8(a); -#endif - return a; -} -#else // !OPENSSL_64_BIT -static inline uint32_t aes_nohw_compact_word(uint32_t a) { -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 - // Note: 0x00cc = 0b0000_0000_1100_1100 - // 0x00cc << 6 = 0b0011_0011_0000_0000 - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - // Now we swap groups of four bits (still numbering by pairs): - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => - // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 - // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - return a; -} - -static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. 
- a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); -#if defined(RING_BIG_ENDIAN) - a = CRYPTO_bswap4(a); -#endif - return a; -} - -static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, - uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | - ((uint32_t)a3 << 24); -} - -static inline uint8_t lo(uint32_t a) { - return (uint8_t)a; -} - -#endif // OPENSSL_64_BIT - -static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const uint8_t in[16]) { - OPENSSL_memcpy(out, in, 16); -#if defined(OPENSSL_64_BIT) - uint64_t a0 = aes_nohw_compact_word(out[0]); - uint64_t a1 = aes_nohw_compact_word(out[1]); - out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); - out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); -#else - uint32_t a0 = aes_nohw_compact_word(out[0]); - uint32_t a1 = aes_nohw_compact_word(out[1]); - uint32_t a2 = aes_nohw_compact_word(out[2]); - uint32_t a3 = aes_nohw_compact_word(out[3]); - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); -#endif -} - -static inline void aes_nohw_uncompact_block( - uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { -#if defined(OPENSSL_64_BIT) - uint64_t a0 = in[0]; - uint64_t a1 = in[1]; - uint64_t b0 = - aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); - uint64_t b1 = - aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - OPENSSL_memcpy(out, &b0, 8); - OPENSSL_memcpy(out + 8, &b1, 8); -#else - uint32_t a0 = in[0]; - uint32_t a1 = in[1]; - uint32_t a2 = in[2]; - uint32_t a3 = in[3]; - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3)); - uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)); - uint32_t b2 = - aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)); - uint32_t b3 = - aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)); - b0 = aes_nohw_uncompact_word(b0); - b1 = aes_nohw_uncompact_word(b1); - b2 = aes_nohw_uncompact_word(b2); - b3 = aes_nohw_uncompact_word(b3); - OPENSSL_memcpy(out, &b0, 4); - OPENSSL_memcpy(out + 4, &b1, 4); - OPENSSL_memcpy(out + 8, &b2, 4); - OPENSSL_memcpy(out + 12, &b3, 4); -#endif -} - -// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in -// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and -// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it -// is repeated to the full width of |aes_word_t|. 
-static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, - uint32_t mask, aes_word_t shift) { -#if defined(OPENSSL_64_BIT) - aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; -#else - aes_word_t mask_w = mask; -#endif - // This is a variation on a delta swap. - aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; - *a ^= swap << shift; - *b ^= swap; -} - -// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides -// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares -// and transposes each square. -static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { - // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). - aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); - -#if AES_NOHW_BATCH_SIZE >= 4 - // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). - aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); -#endif - -#if AES_NOHW_BATCH_SIZE >= 8 - // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). - aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); -#endif -} - -// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, - size_t num_blocks) { - // Don't leave unused blocks uninitialized. - OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH)); - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in + 16 * i); - aes_nohw_batch_set(out, block, i); - } - - aes_nohw_transpose(out); -} - -// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, - const AES_NOHW_BATCH *batch) { - AES_NOHW_BATCH copy = *batch; - aes_nohw_transpose(©); - - debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_batch_get(©, block, i); - aes_nohw_uncompact_block(out + 16 * i, block); - } -} - - -// AES round steps. - -static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, - const AES_NOHW_BATCH *key) { - for (size_t i = 0; i < 8; i++) { - batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); - } -} - -static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. 
- aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. - aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = 
aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. - aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. -#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. -static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. 
- aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} - -static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[0]); - for (size_t i = 1; i < num_rounds; i++) { - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_mix_columns(batch); - aes_nohw_add_round_key(batch, &key->keys[i]); - } - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); -} - -// Key schedule. - -static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, - const AES_KEY *key) { - for (size_t i = 0; i <= key->rounds; i++) { - // Copy the round key into each block in the batch. - for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { - aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16); - aes_nohw_batch_set(&out->keys[i], tmp, j); - } - aes_nohw_transpose(&out->keys[i]); - } -} - -static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, - 0x20, 0x40, 0x80, 0x1b, 0x36}; - -// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in -// |rcon|, stored in a |aes_word_t|. 
-static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { - rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); - return ((aes_word_t)rcon); -} - -static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { - AES_NOHW_BATCH batch; - OPENSSL_memset(&batch, 0, sizeof(batch)); - aes_nohw_batch_set(&batch, in, 0); - aes_nohw_transpose(&batch); - aes_nohw_sub_bytes(&batch); - aes_nohw_transpose(&batch); - aes_nohw_batch_get(&batch, out, 0); -} - -static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { - key->rounds = 10; - - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in); - OPENSSL_memcpy(key->rd_key, block, 16); - - for (size_t i = 1; i <= 10; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block); - uint8_t rcon = aes_nohw_rcon[i - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); - block[j] = aes_nohw_xor( - block[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. Note this is reordered from the usual - // formulation to avoid needing masks. - aes_word_t v = block[j]; - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { - key->rounds = 14; - - // Each key schedule iteration produces two round keys. - aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block1, in); - OPENSSL_memcpy(key->rd_key, block1, 16); - - aes_nohw_compact_block(block2, in + 16); - OPENSSL_memcpy(key->rd_key + 4, block2, 16); - - for (size_t i = 2; i <= 14; i += 2) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); - block1[j] = aes_nohw_xor( - block1[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16); - - if (i == 14) { - break; - } - - aes_nohw_sub_block(sub, block1); - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate the transformed word into the first word. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - } - OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16); - } -} - - -// External API. 
- -int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - switch (bits) { - case 128: - aes_nohw_setup_key_128(aeskey, key); - return 0; - case 256: - aes_nohw_setup_key_256(aeskey, key); - return 0; - } - return 1; -} - -void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], - const uint8_t b[16]) { - for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { - aes_word_t x, y; - OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t)); - OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t)); - x = aes_nohw_xor(x, y); - OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t)); - } -} - -void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t blocks, const AES_KEY *key, - const uint8_t ivec[16]) { - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - - // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. - alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; - alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - OPENSSL_memcpy(ivs + 16 * i, ivec, 16); - } - - uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); - for (;;) { - // Update counters. - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); - } - - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, ivs, todo); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(enc_ivs, todo, &batch); - - for (size_t i = 0; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); - } - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - ctr += AES_NOHW_BATCH_SIZE; - } -} diff --git a/crypto/internal.h b/crypto/internal.h index d56735eab6..63b0b8d68b 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -378,18 +378,6 @@ static inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { static inline uint32_t CRYPTO_bswap4(uint32_t x) { return __builtin_bswap32(x); } - -static inline uint64_t CRYPTO_bswap8(uint64_t x) { - return __builtin_bswap64(x); -} -#elif defined(_MSC_VER) -#pragma warning(push, 3) -#include -#pragma warning(pop) -#pragma intrinsic(_byteswap_ulong) -static inline uint32_t CRYPTO_bswap4(uint32_t x) { - return _byteswap_ulong(x); -} #endif #if !defined(RING_CORE_NOSTDLIBINC) @@ -457,23 +445,6 @@ static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline uint32_t CRYPTO_load_u32_be(const void *in) { - uint32_t v; - OPENSSL_memcpy(&v, in, sizeof(v)); -#if !defined(RING_BIG_ENDIAN) - return CRYPTO_bswap4(v); -#else - return v; -#endif -} - -static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { -#if !defined(RING_BIG_ENDIAN) - v = CRYPTO_bswap4(v); -#endif - OPENSSL_memcpy(out, &v, sizeof(v)); -} - // Runtime CPU feature support #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) diff --git a/include/ring-core/aes.h b/include/ring-core/aes.h deleted file mode 100644 index 5b5130dad7..0000000000 --- 
a/include/ring-core/aes.h +++ /dev/null @@ -1,68 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_AES_H -#define OPENSSL_HEADER_AES_H - -#include - -// Raw AES functions. - - -// AES_MAXNR is the maximum number of AES rounds. -#define AES_MAXNR 14 - -// aes_key_st should be an opaque type, but EVP requires that the size be -// known. -struct aes_key_st { - uint32_t rd_key[4 * (AES_MAXNR + 1)]; - unsigned rounds; -}; -typedef struct aes_key_st AES_KEY; - -#endif // OPENSSL_HEADER_AES_H diff --git a/src/aead/aes.rs b/src/aead/aes.rs index 15802da29d..e3b09fd250 100644 --- a/src/aead/aes.rs +++ b/src/aead/aes.rs @@ -12,6 +12,8 @@ // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+mod aes_nohw; + use super::{nonce::Nonce, quic::Sample}; use crate::{ bits::BitLength, @@ -177,7 +179,7 @@ impl Key { cpu_features: cpu::Features, ) -> Result { let mut key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [[0u32; 4]; MAX_ROUNDS + 1], rounds: 0, }; @@ -203,9 +205,7 @@ impl Key { // SAFETY: `aes_nohw_set_encrypt_key` satisfies the `set_encrypt_key!` // contract. - Implementation::NOHW => unsafe { - set_encrypt_key!(aes_nohw_set_encrypt_key, bytes, &mut key, cpu_features)?; - }, + Implementation::NOHW => aes_nohw::set_encrypt_key(&mut key, bytes), }; Ok(Self { inner: key }) @@ -225,7 +225,11 @@ impl Key { ))] Implementation::VPAES_BSAES => encrypt_block!(vpaes_encrypt, a, self), - Implementation::NOHW => encrypt_block!(aes_nohw_encrypt, a, self), + Implementation::NOHW => { + let mut in_out = a; + aes_nohw::encrypt_block(&self.inner, &mut in_out); + in_out + } } } @@ -327,16 +331,7 @@ impl Key { // above, as required by `aes_nohw_ctr32_encrypt_blocks`. // * `aes_nohw_ctr32_encrypt_blocks` satisfies the contract for // `ctr32_encrypt_blocks`. - Implementation::NOHW => unsafe { - ctr32_encrypt_blocks!( - aes_nohw_ctr32_encrypt_blocks, - in_out, - src, - &self.inner, - ctr, - cpu_features - ) - }, + Implementation::NOHW => aes_nohw::ctr32_encrypt_within(&self.inner, in_out, src, ctr), } } @@ -358,15 +353,13 @@ impl Key { } } -// Keep this in sync with AES_KEY in aes.h. #[repr(C)] #[derive(Clone)] pub(super) struct AES_KEY { - pub rd_key: [u32; 4 * (MAX_ROUNDS + 1)], + pub rd_key: [[u32; 4]; MAX_ROUNDS + 1], pub rounds: c::uint, } -// Keep this in sync with `AES_MAXNR` in aes.h. const MAX_ROUNDS: usize = 14; pub const AES_128_KEY_LEN: usize = 128 / 8; @@ -399,6 +392,10 @@ impl Counter { let new_value = old_value + increment_by; [*c0, *c1, *c2, *c3] = u32::to_be_bytes(new_value); } + + pub(super) fn as_bytes_less_safe(&self) -> [u8; 16] { + self.0 + } } /// The IV for a single block encryption. @@ -510,7 +507,7 @@ unsafe fn bsaes_ctr32_encrypt_blocks_with_vpaes_key( } let mut bsaes_key = AES_KEY { - rd_key: [0u32; 4 * (MAX_ROUNDS + 1)], + rd_key: [[0u32; 4]; MAX_ROUNDS + 1], rounds: 0, }; // SAFETY: diff --git a/src/aead/aes/aes_nohw.rs b/src/aead/aes/aes_nohw.rs new file mode 100644 index 0000000000..77cc97ddbf --- /dev/null +++ b/src/aead/aes/aes_nohw.rs @@ -0,0 +1,786 @@ +// Copyright (c) 2019, Google Inc. +// Portions Copyright 2024 Brian Smith. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ +use super::{Counter, KeyBytes, AES_KEY, BLOCK_LEN, MAX_ROUNDS}; +use crate::{ + constant_time, + polyfill::{self, usize_from_u32, ArraySplitMap as _}, +}; +use cfg_if::cfg_if; +use core::{array, ops::RangeFrom}; + +type Word = constant_time::Word; +const WORD_SIZE: usize = core::mem::size_of::(); +const BATCH_SIZE: usize = WORD_SIZE / 2; +#[allow(clippy::cast_possible_truncation)] +const BATCH_SIZE_U32: u32 = BATCH_SIZE as u32; + +const BLOCK_WORDS: usize = 16 / WORD_SIZE; + +cfg_if! { + if #[cfg(target_pointer_width = "64")] { + const ROW0_MASK: Word = 0x000f000f000f000f; + const ROW1_MASK: Word = 0x00f000f000f000f0; + const ROW2_MASK: Word = 0x0f000f000f000f00; + const ROW3_MASK: Word = 0xf000f000f000f000; + } else if #[cfg(target_pointer_width = "32")] { + const ROW0_MASK: Word = 0x03030303; + const ROW1_MASK: Word = 0x0c0c0c0c; + const ROW2_MASK: Word = 0x30303030; + const ROW3_MASK: Word = 0xc0c0c0c0; + } +} + +#[inline(always)] +fn and(a: Word, b: Word) -> Word { + a & b +} + +#[inline(always)] +fn or(a: Word, b: Word) -> Word { + a | b +} + +#[inline(always)] +fn xor(a: Word, b: Word) -> Word { + a ^ b +} + +#[inline(always)] +fn not(a: Word) -> Word { + !a +} + +#[inline(always)] +fn shift_left(a: Word) -> Word { + a << (I * BATCH_SIZE_U32) +} + +#[inline(always)] +fn shift_right(a: Word) -> Word { + a >> (I * BATCH_SIZE_U32) +} + +// aes_nohw_delta_swap returns |a| with bits |a & mask| and +// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. +#[inline(always)] +fn delta_swap(a: Word) -> Word { + // See + // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ + let b = (a ^ (a >> SHIFT)) & MASK; + a ^ b ^ (b << SHIFT) +} + +// In the 32-bit and 64-bit implementations, a block spans multiple words. +// |aes_nohw_compact_block| must permute bits across different words. First we +// implement |aes_nohw_compact_word| which performs a smaller version of the +// transformation which stays within a single word. +// +// These transformations are generalizations of the output of +// http://programming.sirrida.de/calcperm.php on smaller inputs. +#[inline(always)] +fn compact_word(a: Word) -> Word { + let a = Word::from_le(a); + cfg_if! 
{ + if #[cfg(target_pointer_width = "64")] { + // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap + // quartets of those chunks: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 + let a = delta_swap::<0x00f000f000f000f0, 4>(a); + // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 + let a = delta_swap::<0x0000ff000000ff00, 8>(a); + // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => + // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 + delta_swap::<0x00000000ffff0000, 16>(a) + } else if #[cfg(target_pointer_width = "32")] { + // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 + // Note: 0x00cc = 0b0000_0000_1100_1100 + // 0x00cc << 6 = 0b0011_0011_0000_0000 + let a = delta_swap::<0x00cc00cc, 6>(a); + // Now we swap groups of four bits (still numbering by pairs): + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => + // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 + // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 + delta_swap::<0x0000f0f0, 12>(a) + } else { + unimplemented!() + } + } +} + +#[inline(always)] +fn uncompact_word(a: Word) -> Word { + #[cfg(target_pointer_width = "64")] + let r = { + // Reverse the steps of |aes_nohw_uncompact_word|. + let a = delta_swap::<0x00000000ffff0000, 16>(a); + let a = delta_swap::<0x0000ff000000ff00, 8>(a); + delta_swap::<0x00f000f000f000f0, 4>(a) + }; + + #[cfg(target_pointer_width = "32")] + let r = { + let a = delta_swap::<0x0000f0f0, 12>(a); + delta_swap::<0x00cc00cc, 6>(a) + }; + + Word::to_le(r) +} + +fn compact_block(input: &[u8; 16]) -> [Word; BLOCK_WORDS] { + let out: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(*input) }; + let a0 = compact_word(out[0]); + let a1 = compact_word(out[1]); + + #[cfg(target_pointer_width = "64")] + let r = [ + (a0 & 0x00000000ffffffff) | (a1 << 32), + (a1 & 0xffffffff00000000) | (a0 >> 32), + ]; + + #[cfg(target_pointer_width = "32")] + let r = { + let a2 = compact_word(out[2]); + let a3 = compact_word(out[3]); + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + [ + Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]), + Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]), + Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]), + Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]), + ] + }; + + r +} + +fn uncompact_block(out: &mut [u8; BLOCK_LEN], input: &[Word; BLOCK_WORDS]) { + let a0 = input[0]; + let a1 = input[1]; + + #[cfg(target_pointer_width = "64")] + let [b0, b1] = { + [ + (a0 & 0x00000000ffffffff) | (a1 << 32), + (a1 & 0xffffffff00000000) | (a0 >> 32), + ] + }; + + #[cfg(target_pointer_width = "32")] + let [b0, b1, b2, b3] = { + let a2 = input[2]; + let a3 = input[3]; + + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. 
This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + let b0 = Word::from_le_bytes([lo(a0), lo(a1), lo(a2), lo(a3)]); + let b1 = Word::from_le_bytes([lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8)]); + let b2 = Word::from_le_bytes([lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16)]); + let b3 = Word::from_le_bytes([lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24)]); + [b0, b1, b2, b3] + }; + + let b0 = uncompact_word(b0); + let b1 = uncompact_word(b1); + + #[cfg(target_pointer_width = "32")] + let (b2, b3) = (uncompact_word(b2), uncompact_word(b3)); + + let (out, _) = polyfill::slice::as_chunks_mut(out); + out[0] = Word::to_ne_bytes(b0); + out[1] = Word::to_ne_bytes(b1); + + #[cfg(target_pointer_width = "32")] + { + out[2] = Word::to_ne_bytes(b2); + out[3] = Word::to_ne_bytes(b3); + } +} + +#[cfg(target_pointer_width = "32")] +#[inline(always)] +fn lo(w: Word) -> u8 { + w as u8 +} + +// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in +// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and +// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it +// is repeated to the full width of |aes_word_t|. +fn swap_bits( + w: &mut [Word; 8], +) { + // TODO: const MASK: Word = ... + let mask = Word::from_ne_bytes([MASK_BYTE; core::mem::size_of::()]); + + // This is a variation on a delta swap. + let swap = ((w[A] >> SHIFT) ^ w[B]) & mask; + w[A] ^= swap << SHIFT; + w[B] ^= swap; +} + +// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise +// specified, it is in bitsliced form. +#[repr(C)] +struct Batch { + w: [Word; 8], +} + +impl Batch { + // aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. + // |num_blocks| must be at most |AES_NOHW_BATCH|. + fn from_bytes(input: &[[u8; BLOCK_LEN]]) -> Self { + let mut r = Self { + w: Default::default(), + }; + input.iter().enumerate().for_each(|(i, input)| { + let block = compact_block(input); + r.set(&block, i); + }); + r.transpose(); + r + } + + // aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in + // compact form. + fn set(&mut self, input: &[Word; BLOCK_WORDS], i: usize) { + assert!(i < self.w.len()); + + // Note the words are interleaved. The order comes from |aes_nohw_transpose|. + // If |i| is zero and this is the 64-bit implementation, in[0] contains bits + // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at + // w[4] so that bits 0 and 4 are in the correct position. (In general, bits + // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares + // will be correctly placed.) + cfg_if! { + if #[cfg(target_pointer_width = "64")] { + self.w[i] = input[0]; + self.w[i + 4] = input[1]; + } else if #[cfg(target_pointer_width = "32")] { + self.w[i] = input[0]; + self.w[i + 2] = input[1]; + self.w[i + 4] = input[2]; + self.w[i + 6] = input[3]; + } else { + todo!() + } + } + } + + // aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in + // compact form. + fn get(&self, i: usize) -> [Word; BLOCK_WORDS] { + assert!(i < self.w.len()); + array::from_fn(|j| { + #[cfg(target_pointer_width = "64")] + const STRIDE: usize = 4; + #[cfg(target_pointer_width = "32")] + const STRIDE: usize = 2; + + self.w[i + (j * STRIDE)] + }) + } +} + +// AES round steps. +impl Batch { + fn sub_bytes(&mut self) { + // See https://eprint.iacr.org/2009/191.pdf, Appendix C. 
+ let x0 = self.w[7]; + let x1 = self.w[6]; + let x2 = self.w[5]; + let x3 = self.w[4]; + let x4 = self.w[3]; + let x5 = self.w[2]; + let x6 = self.w[1]; + let x7 = self.w[0]; + + // Figure 2, the top linear transformation. + let y14 = xor(x3, x5); + let y13 = xor(x0, x6); + let y9 = xor(x0, x3); + let y8 = xor(x0, x5); + let t0 = xor(x1, x2); + let y1 = xor(t0, x7); + let y4 = xor(y1, x3); + let y12 = xor(y13, y14); + let y2 = xor(y1, x0); + let y5 = xor(y1, x6); + let y3 = xor(y5, y8); + let t1 = xor(x4, y12); + let y15 = xor(t1, x5); + let y20 = xor(t1, x1); + let y6 = xor(y15, x7); + let y10 = xor(y15, t0); + let y11 = xor(y20, y9); + let y7 = xor(x7, y11); + let y17 = xor(y10, y11); + let y19 = xor(y10, y8); + let y16 = xor(t0, y11); + let y21 = xor(y13, y16); + let y18 = xor(x0, y16); + + // Figure 3, the middle non-linear section. + let t2 = and(y12, y15); + let t3 = and(y3, y6); + let t4 = xor(t3, t2); + let t5 = and(y4, x7); + let t6 = xor(t5, t2); + let t7 = and(y13, y16); + let t8 = and(y5, y1); + let t9 = xor(t8, t7); + let t10 = and(y2, y7); + let t11 = xor(t10, t7); + let t12 = and(y9, y11); + let t13 = and(y14, y17); + let t14 = xor(t13, t12); + let t15 = and(y8, y10); + let t16 = xor(t15, t12); + let t17 = xor(t4, t14); + let t18 = xor(t6, t16); + let t19 = xor(t9, t14); + let t20 = xor(t11, t16); + let t21 = xor(t17, y20); + let t22 = xor(t18, y19); + let t23 = xor(t19, y21); + let t24 = xor(t20, y18); + let t25 = xor(t21, t22); + let t26 = and(t21, t23); + let t27 = xor(t24, t26); + let t28 = and(t25, t27); + let t29 = xor(t28, t22); + let t30 = xor(t23, t24); + let t31 = xor(t22, t26); + let t32 = and(t31, t30); + let t33 = xor(t32, t24); + let t34 = xor(t23, t33); + let t35 = xor(t27, t33); + let t36 = and(t24, t35); + let t37 = xor(t36, t34); + let t38 = xor(t27, t36); + let t39 = and(t29, t38); + let t40 = xor(t25, t39); + let t41 = xor(t40, t37); + let t42 = xor(t29, t33); + let t43 = xor(t29, t40); + let t44 = xor(t33, t37); + let t45 = xor(t42, t41); + let z0 = and(t44, y15); + let z1 = and(t37, y6); + let z2 = and(t33, x7); + let z3 = and(t43, y16); + let z4 = and(t40, y1); + let z5 = and(t29, y7); + let z6 = and(t42, y11); + let z7 = and(t45, y17); + let z8 = and(t41, y10); + let z9 = and(t44, y12); + let z10 = and(t37, y3); + let z11 = and(t33, y4); + let z12 = and(t43, y13); + let z13 = and(t40, y5); + let z14 = and(t29, y2); + let z15 = and(t42, y9); + let z16 = and(t45, y14); + let z17 = and(t41, y8); + + // Figure 4, bottom linear transformation. 
+ let t46 = xor(z15, z16); + let t47 = xor(z10, z11); + let t48 = xor(z5, z13); + let t49 = xor(z9, z10); + let t50 = xor(z2, z12); + let t51 = xor(z2, z5); + let t52 = xor(z7, z8); + let t53 = xor(z0, z3); + let t54 = xor(z6, z7); + let t55 = xor(z16, z17); + let t56 = xor(z12, t48); + let t57 = xor(t50, t53); + let t58 = xor(z4, t46); + let t59 = xor(z3, t54); + let t60 = xor(t46, t57); + let t61 = xor(z14, t57); + let t62 = xor(t52, t58); + let t63 = xor(t49, t58); + let t64 = xor(z4, t59); + let t65 = xor(t61, t62); + let t66 = xor(z1, t63); + let s0 = xor(t59, t63); + let s6 = xor(t56, not(t62)); + let s7 = xor(t48, not(t60)); + let t67 = xor(t64, t65); + let s3 = xor(t53, t66); + let s4 = xor(t51, t66); + let s5 = xor(t47, t65); + let s1 = xor(t64, not(s3)); + let s2 = xor(t55, not(t67)); + + self.w[0] = s7; + self.w[1] = s6; + self.w[2] = s5; + self.w[3] = s4; + self.w[4] = s3; + self.w[5] = s2; + self.w[6] = s1; + self.w[7] = s0; + } + + fn add_round_key(&mut self, key: &Batch) { + constant_time::xor_assign_at_start(&mut self.w, &key.w) + } + + #[inline(always)] + fn rotate_cols_right( + v: Word, + ) -> Word { + or( + shift_right::(v), + shift_left::(v), + ) + } +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. +// TODO(MSRV feature(generic_const_exprs)): Replace this. +macro_rules! rotate_cols_right { + ( Self::rotate_cols_right::<$N:literal>($v:expr) ) => { + Self::rotate_cols_right::<{ $N * 4 }, { 16 - ($N * 4) }>($v) + }; +} + +impl Batch { + fn shift_rows(&mut self) { + self.w.iter_mut().for_each(|w| { + let row0 = and(*w, ROW0_MASK); + let row1 = and(*w, ROW1_MASK); + let row2 = and(*w, ROW2_MASK); + let row3 = and(*w, ROW3_MASK); + let row1 = rotate_cols_right!(Self::rotate_cols_right::<1>(row1)); + let row2 = rotate_cols_right!(Self::rotate_cols_right::<2>(row2)); + let row3 = rotate_cols_right!(Self::rotate_cols_right::<3>(row3)); + *w = or(or(row0, row1), or(row2, row3)); + }); + } + + fn mix_columns(&mut self) { + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. + let a0 = self.w[0]; + let a1 = self.w[1]; + let a2 = self.w[2]; + let a3 = self.w[3]; + let a4 = self.w[4]; + let a5 = self.w[5]; + let a6 = self.w[6]; + let a7 = self.w[7]; + + let r0 = rotate_rows_down(a0); + let a0_r0 = xor(a0, r0); + let r1 = rotate_rows_down(a1); + let a1_r1 = xor(a1, r1); + let r2 = rotate_rows_down(a2); + let a2_r2 = xor(a2, r2); + let r3 = rotate_rows_down(a3); + let a3_r3 = xor(a3, r3); + let r4 = rotate_rows_down(a4); + let a4_r4 = xor(a4, r4); + let r5 = rotate_rows_down(a5); + let a5_r5 = xor(a5, r5); + let r6 = rotate_rows_down(a6); + let a6_r6 = xor(a6, r6); + let r7 = rotate_rows_down(a7); + let a7_r7 = xor(a7, r7); + + self.w[0] = xor(xor(a7_r7, r0), rotate_rows_twice(a0_r0)); + self.w[1] = xor(xor(a0_r0, a7_r7), xor(r1, rotate_rows_twice(a1_r1))); + self.w[2] = xor(xor(a1_r1, r2), rotate_rows_twice(a2_r2)); + self.w[3] = xor(xor(a2_r2, a7_r7), xor(r3, rotate_rows_twice(a3_r3))); + self.w[4] = xor(xor(a3_r3, a7_r7), xor(r4, rotate_rows_twice(a4_r4))); + self.w[5] = xor(xor(a4_r4, r5), rotate_rows_twice(a5_r5)); + self.w[6] = xor(xor(a5_r5, r6), rotate_rows_twice(a6_r6)); + self.w[7] = xor(xor(a6_r6, r7), rotate_rows_twice(a7_r7)); + } + + // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|. + // |num_blocks| must be at most |AES_NOHW_BATCH|. 
+    // aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to |out|.
+    // |num_blocks| must be at most |AES_NOHW_BATCH|.
+    pub fn into_bytes(self, out: &mut [[u8; BLOCK_LEN]]) {
+        assert!(out.len() <= BATCH_SIZE);
+
+        // TODO: Why did the original code copy `self`?
+        let mut copy = self;
+        copy.transpose();
+        out.iter_mut().enumerate().for_each(|(i, out)| {
+            let block = copy.get(i);
+            uncompact_block(out, &block);
+        });
+    }
+
+    fn encrypt(mut self, key: &Schedule, rounds: usize, out: &mut [[u8; BLOCK_LEN]]) {
+        assert!(out.len() <= BATCH_SIZE);
+        self.add_round_key(&key.keys[0]);
+        key.keys[1..rounds].iter().for_each(|key| {
+            self.sub_bytes();
+            self.shift_rows();
+            self.mix_columns();
+            self.add_round_key(key);
+        });
+        self.sub_bytes();
+        self.shift_rows();
+        self.add_round_key(&key.keys[rounds]);
+        self.into_bytes(out);
+    }
+
+    // aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
+    // the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
+    // and transposes each square.
+    fn transpose(&mut self) {
+        const _: () = assert!(BATCH_SIZE == 2 || BATCH_SIZE == 4);
+
+        // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
+        swap_bits::<0, 1, 0x55, 1>(&mut self.w);
+        swap_bits::<2, 3, 0x55, 1>(&mut self.w);
+        swap_bits::<4, 5, 0x55, 1>(&mut self.w);
+        swap_bits::<6, 7, 0x55, 1>(&mut self.w);
+
+        if BATCH_SIZE >= 4 {
+            // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
+            swap_bits::<0, 2, 0x33, 2>(&mut self.w);
+            swap_bits::<1, 3, 0x33, 2>(&mut self.w);
+            swap_bits::<4, 6, 0x33, 2>(&mut self.w);
+            swap_bits::<5, 7, 0x33, 2>(&mut self.w);
+        }
+    }
+}
+
+#[inline(always)]
+fn rotate_rows_down(v: Word) -> Word {
+    #[cfg(target_pointer_width = "64")]
+    {
+        ((v >> 4) & 0x0fff0fff0fff0fff) | ((v << 12) & 0xf000f000f000f000)
+    }
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0)
+    }
+}
+
+// rotate_rows_twice returns |v| with the rows in each column rotated
+// by two.
+#[inline(always)]
+fn rotate_rows_twice(v: Word) -> Word {
+    #[cfg(target_pointer_width = "64")]
+    {
+        ((v >> 8) & 0x00ff00ff00ff00ff) | ((v << 8) & 0xff00ff00ff00ff00)
+    }
+
+    #[cfg(target_pointer_width = "32")]
+    {
+        ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0)
+    }
+}
+
+// Key schedule.
+
+// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
+// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH|
+// |AES_KEY|s so it should not be used as a long-term key representation.
+struct Schedule {
+    // keys is an array of batches, one for each round key. Each batch stores
+    // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
+    keys: [Batch; MAX_ROUNDS + 1],
+}
+
+impl Schedule {
+    fn expand_round_keys(key: &AES_KEY) -> Self {
+        Self {
+            keys: array::from_fn(|i| {
+                let tmp: [Word; BLOCK_WORDS] = unsafe { core::mem::transmute(key.rd_key[i]) };
+
+                let mut r = Batch { w: [0; 8] };
+                // Copy the round key into each block in the batch.
+                for j in 0..BATCH_SIZE {
+                    r.set(&tmp, j);
+                }
+                r.transpose();
+                r
+            }),
+        }
+    }
+}
+
+static RCON: [u8; 10] = [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36];
+
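RCON is the usual AES round-constant table: successive powers of x in GF(2^8). A minimal standalone sketch that reproduces it (the `xtime`-style update below is illustrative, not part of this change):

    fn main() {
        let mut expected = [0u8; 10];
        let mut x: u8 = 1;
        for rcon in expected.iter_mut() {
            *rcon = x;
            // Multiply by x (i.e. by 2) in GF(2^8), reducing by the AES polynomial.
            x = (x << 1) ^ (if x & 0x80 != 0 { 0x1b } else { 0 });
        }
        assert_eq!(
            expected,
            [0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36]
        );
    }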
+// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
+// |rcon|, stored in a |aes_word_t|.
+#[inline(always)]
+fn rcon_slice(rcon: u8, i: usize) -> Word {
+    let rcon = (rcon >> (i * BATCH_SIZE)) & ((1 << BATCH_SIZE) - 1);
+    rcon.into()
+}
+
+pub(super) fn set_encrypt_key(key: &mut AES_KEY, bytes: KeyBytes) {
+    match bytes {
+        KeyBytes::AES_128(bytes) => setup_key_128(key, bytes),
+        KeyBytes::AES_256(bytes) => setup_key_256(key, bytes),
+    }
+}
+
+fn setup_key_128(key: &mut AES_KEY, input: &[u8; 128 / 8]) {
+    key.rounds = 10;
+
+    let mut block = compact_block(input);
+    key.rd_key[0] = unsafe { core::mem::transmute(block) };
+
+    key.rd_key[1..=10]
+        .iter_mut()
+        .zip(RCON)
+        .for_each(|(rd_key, rcon)| {
+            let sub = sub_block(&block);
+            *rd_key = derive_round_key(&mut block, sub, rcon);
+        });
+}
+
+pub(super) fn encrypt_block(key: &AES_KEY, in_out: &mut [u8; BLOCK_LEN]) {
+    let sched = Schedule::expand_round_keys(key);
+    let batch = Batch::from_bytes(core::slice::from_ref(in_out));
+    batch.encrypt(&sched, usize_from_u32(key.rounds), array::from_mut(in_out));
+}
+
+fn setup_key_256(key: &mut AES_KEY, input: &[u8; 32]) {
+    key.rounds = 14;
+
+    // Each key schedule iteration produces two round keys.
+    let (input, _) = polyfill::slice::as_chunks(input);
+    let mut block1 = compact_block(&input[0]);
+    key.rd_key[0] = unsafe { core::mem::transmute(block1) };
+    let mut block2 = compact_block(&input[1]);
+    key.rd_key[1] = unsafe { core::mem::transmute(block2) };
+
+    key.rd_key[2..=14]
+        .chunks_mut(2)
+        .zip(RCON)
+        .for_each(|(rd_key_pair, rcon)| {
+            let sub = sub_block(&block2);
+            rd_key_pair[0] = derive_round_key(&mut block1, sub, rcon);
+
+            if let Some(rd_key_2) = rd_key_pair.get_mut(1) {
+                let sub = sub_block(&block1);
+                block2.iter_mut().zip(sub).for_each(|(w, sub)| {
+                    // Incorporate the transformed word into the first word.
+                    *w ^= shift_right::<12>(sub);
+                    // Propagate to the remaining words.
+                    let v = *w;
+                    *w ^= shift_left::<4>(v);
+                    *w ^= shift_left::<8>(v);
+                    *w ^= shift_left::<12>(v);
+                });
+                *rd_key_2 = unsafe { core::mem::transmute(block2) };
+            }
+        });
+}
+
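For intuition about `rcon_slice`: with 64-bit words BATCH_SIZE is 4, so an 8-bit round constant is consumed as two 4-bit groups, one per compacted block word. A standalone sketch with that constant inlined (the names here are illustrative):

    fn main() {
        const BATCH_SIZE: usize = 4; // 64-bit words
        let rcon: u8 = 0x1b;
        let groups: [u8; 2] =
            core::array::from_fn(|i| (rcon >> (i * BATCH_SIZE)) & ((1 << BATCH_SIZE) - 1));
        // Group 0 holds the low nibble, group 1 the high nibble.
        assert_eq!(groups, [0x0b, 0x01]);
    }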
+fn derive_round_key(
+    block: &mut [Word; BLOCK_WORDS],
+    sub: [Word; BLOCK_WORDS],
+    rcon: u8,
+) -> [u32; 4] {
+    block
+        .iter_mut()
+        .zip(sub)
+        .enumerate()
+        .for_each(|(j, (w, sub))| {
+            // Incorporate |rcon| and the transformed word into the first word.
+            *w ^= rcon_slice(rcon, j);
+            *w ^= shift_right::<12>(rotate_rows_down(sub));
+            // Propagate to the remaining words.
+            let v = *w;
+            *w ^= shift_left::<4>(v);
+            *w ^= shift_left::<8>(v);
+            *w ^= shift_left::<12>(v);
+        });
+    unsafe { core::mem::transmute(*block) }
+}
+
+fn sub_block(input: &[Word; BLOCK_WORDS]) -> [Word; BLOCK_WORDS] {
+    let mut batch = Batch {
+        w: Default::default(),
+    };
+    batch.set(input, 0);
+    batch.transpose();
+    batch.sub_bytes();
+    batch.transpose();
+    batch.get(0)
+}
+
+pub(super) fn ctr32_encrypt_within(
+    key: &AES_KEY,
+    mut in_out: &mut [u8],
+    src: RangeFrom<usize>,
+    ctr: &mut Counter,
+) {
+    let (input, leftover): (&[[u8; BLOCK_LEN]], _) =
+        polyfill::slice::as_chunks(&in_out[src.clone()]);
+    debug_assert_eq!(leftover.len(), 0);
+    if input.is_empty() {
+        return;
+    }
+    let blocks_u32 = u32::try_from(input.len()).unwrap();
+
+    let sched = Schedule::expand_round_keys(key);
+
+    let initial_ctr = ctr.as_bytes_less_safe();
+    ctr.increment_by_less_safe(blocks_u32);
+
+    let mut ivs = [initial_ctr; BATCH_SIZE];
+    let mut enc_ctrs = [[0u8; 16]; BATCH_SIZE];
+    let initial_ctr: [[u8; 4]; 4] = initial_ctr.array_split_map(|x| x);
+    let mut ctr = u32::from_be_bytes(initial_ctr[3]);
+
+    for _ in (0..).step_by(BATCH_SIZE) {
+        (0u32..).zip(ivs.iter_mut()).for_each(|(i, iv)| {
+            iv[12..].copy_from_slice(&u32::to_be_bytes(ctr + i));
+        });
+
+        let (input, leftover): (&[[u8; BLOCK_LEN]], _) =
+            polyfill::slice::as_chunks(&in_out[src.clone()]);
+        debug_assert_eq!(leftover.len(), 0);
+        let todo = core::cmp::min(ivs.len(), input.len());
+        let batch = Batch::from_bytes(&ivs[..todo]);
+        batch.encrypt(&sched, usize_from_u32(key.rounds), &mut enc_ctrs[..todo]);
+        constant_time::xor_within_chunked_at_start(in_out, src.clone(), &enc_ctrs[..todo]);
+
+        if todo < BATCH_SIZE {
+            break;
+        }
+        in_out = &mut in_out[(BLOCK_LEN * BATCH_SIZE)..];
+        ctr += BATCH_SIZE_U32;
+    }
+}
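In the CTR loop above, only the last four bytes of each counter block vary within a batch; they carry a big-endian block counter that increases by one per block. A minimal standalone sketch of that layout (the variable names are illustrative):

    fn main() {
        let mut iv = [0u8; 16];
        let initial: u32 = 1;
        let blocks: Vec<[u8; 16]> = (0..3u32)
            .map(|i| {
                // Bytes 0..12 (the nonce/IV prefix) stay fixed; bytes 12..16 hold the counter.
                iv[12..].copy_from_slice(&(initial + i).to_be_bytes());
                iv
            })
            .collect();
        assert_eq!(&blocks[0][12..], &[0u8, 0, 0, 1]);
        assert_eq!(&blocks[2][12..], &[0u8, 0, 0, 3]);
    }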
diff --git a/src/constant_time.rs b/src/constant_time.rs
index dd18463b5e..65918e2c4a 100644
--- a/src/constant_time.rs
+++ b/src/constant_time.rs
@@ -14,7 +14,14 @@
 //! Constant-time operations.
 
-use crate::{c, error};
+use crate::{c, error, polyfill};
+use core::{cmp, ops::RangeFrom};
+
+#[cfg(target_pointer_width = "64")]
+pub(crate) type Word = u64;
+
+#[cfg(target_pointer_width = "32")]
+pub(crate) type Word = u32;
 
 /// Returns `Ok(())` if `a == b` and `Err(error::Unspecified)` otherwise.
 /// The comparison of `a` and `b` is done in constant time with respect to the
@@ -45,13 +52,54 @@ pub(crate) fn xor<const N: usize>(mut a: [u8; N], b: [u8; N]) -> [u8; N] {
 
 /// XORs the first N bytes of `b` into `a`, where N is
 /// `core::cmp::min(a.len(), b.len())`.
 #[inline(always)]
-pub(crate) fn xor_assign_at_start<'a>(
+pub(crate) fn xor_assign_at_start_bytes<'a>(
     a: impl IntoIterator<Item = &'a mut u8>,
     b: impl IntoIterator<Item = &'a u8>,
 ) {
     a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b);
 }
 
+/// XORs the first N words of `b` into `a`, where N is
+/// `core::cmp::min(a.len(), b.len())`.
+#[inline(always)]
+pub(crate) fn xor_assign_at_start<'a>(
+    a: impl IntoIterator<Item = &'a mut Word>,
+    b: impl IntoIterator<Item = &'a Word>,
+) {
+    a.into_iter().zip(b).for_each(|(a, b)| *a ^= *b);
+}
+
+#[inline(always)]
+pub(crate) fn xor_within_chunked_at_start<const INNER: usize>(
+    in_out: &mut [u8],
+    src: RangeFrom<usize>,
+    b: &[[u8; INNER]],
+) {
+    let (mut input, num_blocks) = {
+        let input = match in_out.get(src.clone()) {
+            Some(input) => input,
+            None => {
+                panic!()
+            }
+        };
+
+        let (input, _): (&[[u8; INNER]], _) = polyfill::slice::as_chunks(input);
+        let num_blocks = cmp::min(input.len(), b.len());
+        (input.as_ptr(), num_blocks)
+    };
+    let (output, _): (&mut [[u8; INNER]], _) = polyfill::slice::as_chunks_mut(in_out);
+    let output = &mut output[..num_blocks];
+
+    for (b, out) in (b[..num_blocks].iter()).zip(output) {
+        let a = unsafe { core::ptr::read(input) };
+        out.iter_mut()
+            .zip(a.iter().zip(b))
+            .for_each(|(out, (a, b))| {
+                *out = *a ^ *b;
+            });
+        input = unsafe { input.add(1) };
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/hmac.rs b/src/hmac.rs
index 34984d62aa..a710f5c65f 100644
--- a/src/hmac.rs
+++ b/src/hmac.rs
@@ -234,7 +234,7 @@ impl Key {
         // If the key is shorter than one block then we're supposed to act like
         // it is padded with zero bytes up to the block length. `x ^ 0 == x` so
         // we can just leave the trailing bytes of `padded_key` untouched.
-        constant_time::xor_assign_at_start(&mut padded_key[..], key_value);
+        constant_time::xor_assign_at_start_bytes(&mut padded_key[..], key_value);
 
         let leftover = key.inner.update(padded_key, cpu_features);
         debug_assert_eq!(leftover.len(), 0);
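The renamed byte-wise helper and the new word-wise `xor_assign_at_start` compute the same XOR; the word variant just lets the bitsliced AES code operate on whole `Word` lanes. A standalone equivalence sketch (not using the crate's helpers):

    fn main() {
        let a: [u64; 2] = [0x0123_4567_89ab_cdef, 0xfedc_ba98_7654_3210];
        let b: [u64; 2] = [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff];

        // Word at a time.
        let mut words = a;
        words.iter_mut().zip(b.iter()).for_each(|(w, b)| *w ^= *b);

        // Byte at a time on the little-endian encoding.
        let mut bytes: Vec<u8> = a.iter().flat_map(|w| w.to_le_bytes()).collect();
        let b_bytes: Vec<u8> = b.iter().flat_map(|w| w.to_le_bytes()).collect();
        bytes.iter_mut().zip(&b_bytes).for_each(|(x, y)| *x ^= *y);

        let word_bytes: Vec<u8> = words.iter().flat_map(|w| w.to_le_bytes()).collect();
        assert_eq!(bytes, word_bytes);
    }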
diff --git a/src/limb.rs b/src/limb.rs
index 1fd6c27b5a..b18f929bab 100644
--- a/src/limb.rs
+++ b/src/limb.rs
@@ -21,34 +21,20 @@
 use crate::{c, error, polyfill::ArrayFlatMap};
 
 #[cfg(any(test, feature = "alloc"))]
-use crate::bits;
+use crate::{bits, constant_time, polyfill::usize_from_u32};
 
 #[cfg(feature = "alloc")]
 use core::num::Wrapping;
 
 // XXX: Not correct for x32 ABIs.
-#[cfg(target_pointer_width = "64")]
-pub type Limb = u64;
-#[cfg(target_pointer_width = "32")]
-pub type Limb = u32;
-#[cfg(target_pointer_width = "64")]
-pub const LIMB_BITS: usize = 64;
-#[cfg(target_pointer_width = "32")]
-pub const LIMB_BITS: usize = 32;
-
-#[cfg(target_pointer_width = "64")]
-#[derive(Debug, PartialEq)]
-#[repr(u64)]
-pub enum LimbMask {
-    True = 0xffff_ffff_ffff_ffff,
-    False = 0,
-}
+pub type Limb = constant_time::Word;
+pub const LIMB_BITS: usize = usize_from_u32(Limb::BITS);
 
-#[cfg(target_pointer_width = "32")]
+#[cfg_attr(target_pointer_width = "64", repr(u64))]
+#[cfg_attr(target_pointer_width = "32", repr(u32))]
 #[derive(Debug, PartialEq)]
-#[repr(u32)]
 pub enum LimbMask {
-    True = 0xffff_ffff,
+    True = Limb::MAX,
     False = 0,
 }
diff --git a/src/pbkdf2.rs b/src/pbkdf2.rs
index 5a25f5d7f6..d5240fe182 100644
--- a/src/pbkdf2.rs
+++ b/src/pbkdf2.rs
@@ -189,7 +189,7 @@ fn derive_block(secret: &hmac::Key, iterations: NonZeroU32, salt: &[u8], idx: u3
     let mut remaining: u32 = iterations.into();
 
     loop {
-        constant_time::xor_assign_at_start(&mut out[..], u.as_ref());
+        constant_time::xor_assign_at_start_bytes(&mut out[..], u.as_ref());
 
         if remaining == 1 {
             break;
diff --git a/src/polyfill.rs b/src/polyfill.rs
index f09563c0d9..39296fc086 100644
--- a/src/polyfill.rs
+++ b/src/polyfill.rs
@@ -22,7 +22,7 @@ pub const fn u64_from_usize(x: usize) -> u64 {
 }
 
 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-pub fn usize_from_u32(x: u32) -> usize {
+pub const fn usize_from_u32(x: u32) -> usize {
     x as usize
 }
 
diff --git a/src/rsa/padding.rs b/src/rsa/padding.rs
index 2fe7dda575..d544d5e852 100644
--- a/src/rsa/padding.rs
+++ b/src/rsa/padding.rs
@@ -74,7 +74,7 @@ fn mgf1(digest_alg: &'static digest::Algorithm, seed: &[u8], out: &mut [u8]) {
 
         // The last chunk may legitimately be shorter than `digest`, but
         // `digest` will never be shorter than `out`.
-        constant_time::xor_assign_at_start(out, digest.as_ref());
+        constant_time::xor_assign_at_start_bytes(out, digest.as_ref());
     }
 }
 
diff --git a/src/rsa/padding/pss.rs b/src/rsa/padding/pss.rs
index 35fc82be7c..4c4d048894 100644
--- a/src/rsa/padding/pss.rs
+++ b/src/rsa/padding/pss.rs
@@ -159,7 +159,7 @@ impl Verification for PSS {
         // Step 8.
         let db_rest = &mut db[1..];
         let masked_bytes = masked_bytes.read_bytes(db_rest.len())?;
-        constant_time::xor_assign_at_start(db_rest, masked_bytes.as_slice_less_safe());
+        constant_time::xor_assign_at_start_bytes(db_rest, masked_bytes.as_slice_less_safe());
 
         Ok(())
     })?;
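The reworked `LimbMask` relies on `Limb::MAX` being the all-ones pattern for whichever pointer width is selected, and `LIMB_BITS` is now derived from the type rather than hard-coded. A trivial standalone check of those assumptions (the local `Limb` alias mirrors the one in limb.rs):

    #[cfg(target_pointer_width = "64")]
    type Limb = u64;
    #[cfg(target_pointer_width = "32")]
    type Limb = u32;

    fn main() {
        // All-ones is the value LimbMask::True now uses.
        assert_eq!(Limb::MAX, !(0 as Limb));
        // Limb::BITS matches the formerly hard-coded LIMB_BITS values.
        assert_eq!(Limb::BITS as usize, core::mem::size_of::<Limb>() * 8);
    }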