From d8cd1d1c0115e2e3b06432f30451aa18b912719c Mon Sep 17 00:00:00 2001
From: Andy Leiserson <aleiserson@mozilla.com>
Date: Wed, 3 Jan 2024 16:48:06 -0800
Subject: [PATCH] Vectorization for prime fields

Semi-honest AdditiveShare can hold a vector of sharings instead of just
one sharing. The semi-honest multiply can operate on these vectors.
---
 ipa-core/benches/ct/arithmetic_circuit.rs     |  57 +++-
 ipa-core/benches/iai/arithmetic_circuit.rs    |   4 +-
 .../benches/oneshot/arithmetic_circuit.rs     |   4 +-
 ipa-core/src/ff/boolean.rs                    |  15 +-
 ipa-core/src/ff/boolean_array.rs              |  20 +-
 ipa-core/src/ff/curve_points.rs               |   6 +-
 ipa-core/src/ff/ec_prime_field.rs             |  12 +-
 ipa-core/src/ff/field.rs                      |   7 +-
 ipa-core/src/ff/galois_field.rs               |  38 ++-
 ipa-core/src/ff/mod.rs                        |   2 +-
 ipa-core/src/ff/prime_field.rs                |  20 +-
 .../src/helpers/buffers/ordering_sender.rs    |  21 +-
 ipa-core/src/helpers/gateway/send.rs          |   9 +-
 .../src/helpers/gateway/stall_detection.rs    |   3 +-
 ipa-core/src/protocol/basics/mul/mod.rs       |  15 +-
 .../src/protocol/basics/mul/semi_honest.rs    | 200 +++++++++--
 ipa-core/src/protocol/basics/mul/sparse.rs    |  24 +-
 .../modulus_conversion/convert_shares.rs      |   2 +-
 ipa-core/src/secret_sharing/array.rs          | 315 ++++++++++++++++++
 ipa-core/src/secret_sharing/mod.rs            | 208 +++++++++++-
 .../replicated/semi_honest/additive_share.rs  | 257 +++++++++-----
 ipa-core/src/test_fixture/circuit.rs          |  48 ++-
 ipa-core/src/test_fixture/mod.rs              |   2 +-
 ipa-core/src/test_fixture/sharing.rs          |  38 ++-
 24 files changed, 1148 insertions(+), 179 deletions(-)
 create mode 100644 ipa-core/src/secret_sharing/array.rs
diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs
index 1c11e8dd2..fac67dca9 100644
--- a/ipa-core/benches/ct/arithmetic_circuit.rs
+++ b/ipa-core/benches/ct/arithmetic_circuit.rs
@@ -1,8 +1,38 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, BenchmarkId, Criterion, SamplingMode, Throughput,
+    black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
+    BenchmarkId, Criterion, SamplingMode, Throughput,
 };
-use ipa_core::{ff::Fp31, test_fixture::circuit};
-use tokio::runtime::Builder;
+use ipa_core::{
+    ff::{Field, Fp31, Fp32BitPrime},
+    protocol::{basics::SecureMul, context::SemiHonestContext},
+    secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares},
+    test_fixture::circuit,
+};
+use rand::distributions::{Distribution, Standard};
+use tokio::runtime::{Builder, Runtime};
+
+fn do_benchmark<M, F, const N: usize>(
+    rt: &Runtime,
+    group: &mut BenchmarkGroup<M>,
+    width: u32,
+    depth: u16,
+) where
+    M: Measurement,
+    F: Field + FieldSimd<N>,
+    for<'a> Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
+    [F; N]: IntoShares<Replicated<F, N>>,
+    Standard: Distribution<F>,
+{
+    group.throughput(Throughput::Elements(u64::from(width) * u64::from(depth))); // no u32 overflow
+    group.bench_with_input(
+        BenchmarkId::new("circuit", format!("{width}:{depth}:{}x{}", F::NAME, N)),
+        &(width, depth),
+        |b, &(width, depth)| {
+            b.to_async(rt)
+                .iter(|| circuit::arithmetic::<F, N>(black_box(width), black_box(depth)));
+        },
+    );
+}
 
 pub fn criterion_benchmark(c: &mut Criterion) {
     let rt = Builder::new_multi_thread()
@@ -16,19 +46,14 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     group.sample_size(10);
     group.sampling_mode(SamplingMode::Flat);
 
-    for width in [5_000u32, 50_000, 500_000, 1_000_000] {
-        for depth in [1u8, 10, 64] {
-            group.throughput(Throughput::Elements((width * depth as u32) as u64));
-            group.bench_with_input(
-                BenchmarkId::new("circuit", format!("{width}:{depth}")),
-                &(width, depth),
-                |b, &(width, depth)| {
-                    b.to_async(&rt)
-                        .iter(|| circuit::arithmetic::<Fp31>(black_box(width), black_box(depth)));
-                },
-            );
-        }
-    }
+    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 512_000, 1);
+    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 51_200, 10);
+    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 8_000, 64);
+
+    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 25_600, 10);
+    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 2_560, 100);
+    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_000, 64);
+    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 250, 1_024);
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/ipa-core/benches/iai/arithmetic_circuit.rs b/ipa-core/benches/iai/arithmetic_circuit.rs
index af4cc3c2c..ef43e70a0 100644
--- a/ipa-core/benches/iai/arithmetic_circuit.rs
+++ b/ipa-core/benches/iai/arithmetic_circuit.rs
@@ -10,10 +10,10 @@ pub fn iai_benchmark() {
         .expect("Creating runtime failed");
 
     const CIRCUIT_WIDTH: u32 = 500_000;
-    const CIRCUIT_DEPTH: u8 = 1;
+    const CIRCUIT_DEPTH: u16 = 1;
 
     rt.block_on(async {
-        circuit::arithmetic::<Fp31>(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await;
+        circuit::arithmetic::<Fp31, 1>(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await;
     })
 }
 
diff --git a/ipa-core/benches/oneshot/arithmetic_circuit.rs b/ipa-core/benches/oneshot/arithmetic_circuit.rs
index e4fdeceab..c78a409ea 100644
--- a/ipa-core/benches/oneshot/arithmetic_circuit.rs
+++ b/ipa-core/benches/oneshot/arithmetic_circuit.rs
@@ -14,7 +14,7 @@ pub struct CircuitArgs {
     pub width: u32,
 
     #[arg(short, long, help = "depth of the circuit", default_value_t = 10)]
-    pub depth: u8,
+    pub depth: u16,
 
     /// Cargo passes the bench argument
     /// https://doc.rust-lang.org/cargo/commands/cargo-bench.html
@@ -34,7 +34,7 @@ pub async fn main() {
     }
 
     let start = Instant::now();
-    circuit::arithmetic::<Fp31>(args.width, args.depth).await;
+    circuit::arithmetic::<Fp31, 1>(args.width, args.depth).await;
     let duration = start.elapsed().as_secs_f32();
 
     println!("benchmark complete after {duration}s");
diff --git a/ipa-core/src/ff/boolean.rs b/ipa-core/src/ff/boolean.rs
index 6937dd1a9..f577c5ac1 100644
--- a/ipa-core/src/ff/boolean.rs
+++ b/ipa-core/src/ff/boolean.rs
@@ -5,7 +5,10 @@ use super::Gf32Bit;
 use crate::{
     ff::{Field, Serializable},
     protocol::prss::FromRandomU128,
-    secret_sharing::{replicated::malicious::ExtendableField, Block, SharedValue},
+    secret_sharing::{
+        replicated::malicious::ExtendableField, Block, FieldVectorizable, SharedValue, StdArray,
+        Vectorizable,
+    },
 };
 
 impl Block for bool {
@@ -40,6 +43,14 @@ impl SharedValue for Boolean {
     const ZERO: Self = Self(false);
 }
 
+impl Vectorizable<1> for Boolean {
+    type Array = StdArray<Boolean, 1>;
+}
+
+impl FieldVectorizable<1> for Boolean {
+    type ArrayAlias = StdArray<Boolean, 1>;
+}
+
 ///conversion to Scalar struct of `curve25519_dalek`
 impl From<Boolean> for bool {
     fn from(s: Boolean) -> Self {
@@ -146,6 +157,8 @@ impl From<bool> for Boolean {
 
 ///implement Field because required by PRSS
 impl Field for Boolean {
+    const NAME: &'static str = "Boolean";
+
     const ONE: Boolean = Boolean(true);
 
     fn as_u128(&self) -> u128 {
diff --git a/ipa-core/src/ff/boolean_array.rs b/ipa-core/src/ff/boolean_array.rs
index 9d83b1286..e733264f7 100644
--- a/ipa-core/src/ff/boolean_array.rs
+++ b/ipa-core/src/ff/boolean_array.rs
@@ -8,7 +8,7 @@ use typenum::{U14, U2, U32, U8};
 use crate::{
     ff::{boolean::Boolean, ArrayAccess, Field, Serializable},
     protocol::prss::{FromRandom, FromRandomU128},
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, FieldVectorizable, SharedValue, StdArray, Vectorizable},
 };
 
 /// The implementation below cannot be constrained without breaking Rust's
@@ -42,6 +42,12 @@ impl<'a> Iterator for BAIterator<'a> {
     }
 }
 
+impl<'a> ExactSizeIterator for BAIterator<'a> {
+    fn len(&self) -> usize {
+        self.iterator.len()
+    }
+}
+
 /// A value of ONE has a one in the first element of the bit array, followed by `$bits-1` zeros.
 /// This macro uses a bit of recursive repetition to produce those zeros.
 ///
@@ -95,6 +101,8 @@ macro_rules! boolean_array_impl_small {
 
         // TODO(812): remove this impl; BAs are not field elements.
         impl Field for $name {
+            const NAME: &'static str = stringify!($name);
+
             const ONE: Self = Self(bitarr_one!($bits));
 
             fn as_u128(&self) -> u128 {
@@ -153,6 +161,10 @@ macro_rules! boolean_array_impl_small {
                 Field::truncate_from(src)
             }
         }
+
+        impl FieldVectorizable<1> for $name {
+            type ArrayAlias = StdArray<$name, 1>;
+        }
     };
 }
 
@@ -358,6 +370,10 @@ macro_rules! boolean_array_impl {
                 }
             }
 
+            impl Vectorizable<1> for $name {
+                type Array = StdArray<$name, 1>;
+            }
+
             impl std::ops::Mul for $name {
                 type Output = Self;
                 fn mul(self, rhs: Self) -> Self::Output {
@@ -394,7 +410,7 @@ macro_rules! boolean_array_impl {
             #[allow(clippy::into_iter_without_iter)]
             impl<'a> IntoIterator for &'a AdditiveShare<$name> {
                 type Item = AdditiveShare<Boolean>;
-                type IntoIter = ASIterator<BAIterator<'a>>;
+                type IntoIter = ASIterator<'a, $name>;
 
                 fn into_iter(self) -> Self::IntoIter {
                     self.iter()
diff --git a/ipa-core/src/ff/curve_points.rs b/ipa-core/src/ff/curve_points.rs
index 499845f7b..f0db75b49 100644
--- a/ipa-core/src/ff/curve_points.rs
+++ b/ipa-core/src/ff/curve_points.rs
@@ -7,7 +7,7 @@ use typenum::U32;
 
 use crate::{
     ff::{ec_prime_field::Fp25519, Serializable},
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, SharedValue, StdArray, Vectorizable},
 };
 
 impl Block for CompressedRistretto {
@@ -35,6 +35,10 @@ impl SharedValue for RP25519 {
     const ZERO: Self = Self(CompressedRistretto([0_u8; 32]));
 }
 
+impl Vectorizable<1> for RP25519 {
+    type Array = StdArray<Self, 1>;
+}
+
 #[derive(thiserror::Error, Debug)]
 #[error("{0:?} is not the canonical encoding of a Ristretto point.")]
 pub struct NonCanonicalEncoding(CompressedRistretto);
diff --git a/ipa-core/src/ff/ec_prime_field.rs b/ipa-core/src/ff/ec_prime_field.rs
index 0e72024ab..4c03a6a6f 100644
--- a/ipa-core/src/ff/ec_prime_field.rs
+++ b/ipa-core/src/ff/ec_prime_field.rs
@@ -9,7 +9,7 @@ use typenum::U32;
 use crate::{
     ff::{boolean_array::BA256, Field, Serializable},
     protocol::prss::FromRandomU128,
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, FieldVectorizable, SharedValue, StdArray, Vectorizable},
 };
 
 impl Block for Scalar {
@@ -176,8 +176,18 @@ macro_rules! sc_hash_impl {
 #[cfg(test)]
 sc_hash_impl!(u64);
 
+impl Vectorizable<1> for Fp25519 {
+    type Array = StdArray<Self, 1>;
+}
+
+impl FieldVectorizable<1> for Fp25519 {
+    type ArrayAlias = StdArray<Self, 1>;
+}
+
 ///implement Field because required by PRSS
 impl Field for Fp25519 {
+    const NAME: &'static str = "Fp25519";
+
     const ONE: Fp25519 = Fp25519::ONE;
 
     ///both following methods are based on hashing and do not allow to actually convert elements in Fp25519
diff --git a/ipa-core/src/ff/field.rs b/ipa-core/src/ff/field.rs
index 5535ed833..4e098cfda 100644
--- a/ipa-core/src/ff/field.rs
+++ b/ipa-core/src/ff/field.rs
@@ -8,7 +8,7 @@ use typenum::{U1, U4};
 use crate::{
     error,
     protocol::prss::FromRandomU128,
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, FieldVectorizable, SharedValue, Vectorizable},
 };
 
 impl Block for u8 {
@@ -29,7 +29,12 @@ pub trait Field:
     + FromRandomU128
     + TryFrom<u128, Error = error::Error>
     + Into<Self::Storage>
+    + Vectorizable<1>
+    + FieldVectorizable<1, ArrayAlias = <Self as Vectorizable<1>>::Array>
 {
+    /// Name of the field type, e.g. for labelling benchmark results.
+    const NAME: &'static str;
+
     /// Multiplicative identity element
     const ONE: Self;
 
diff --git a/ipa-core/src/ff/galois_field.rs b/ipa-core/src/ff/galois_field.rs
index fb7c9ae0a..64a345f6f 100644
--- a/ipa-core/src/ff/galois_field.rs
+++ b/ipa-core/src/ff/galois_field.rs
@@ -15,7 +15,7 @@ use crate::{
     ff::{boolean_array::NonZeroPadding, Field, Serializable},
     impl_serializable_trait,
     protocol::prss::FromRandomU128,
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, FieldVectorizable, SharedValue, Vectorizable},
 };
 
 /// Trait for data types storing arbitrary number of bits.
@@ -148,6 +148,12 @@ impl<'a> Iterator for BoolIterator<'a> {
     }
 }
 
+impl<'a> ExactSizeIterator for BoolIterator<'a> {
+    fn len(&self) -> usize {
+        self.0.len()
+    }
+}
+
 macro_rules! bit_array_impl {
     ( $modname:ident, $name:ident, $store:ty, $bits:expr, $one:expr, $polynomial:expr, $deser_type: tt, $({$($extra:item)*})? ) => {
         #[allow(clippy::suspicious_arithmetic_impl)]
@@ -169,7 +175,17 @@ macro_rules! bit_array_impl {
                 const ZERO: Self = Self(<$store>::ZERO);
             }
 
+            impl Vectorizable<1> for $name {
+                type Array = crate::secret_sharing::StdArray<$name, 1>;
+            }
+
+            impl FieldVectorizable<1> for $name {
+                type ArrayAlias = crate::secret_sharing::StdArray<$name, 1>;
+            }
+
             impl Field for $name {
+                const NAME: &'static str = stringify!($name);
+
                 const ONE: Self = Self($one);
 
                 fn as_u128(&self) -> u128 {
@@ -693,5 +709,25 @@ bit_array_impl!(
                 value != Gf2::ZERO
             }
         }
+
+        impl From<crate::ff::boolean::Boolean> for Gf2 {
+            fn from(value: crate::ff::boolean::Boolean) -> Self {
+                bool::from(value).into()
+            }
+        }
+
+        impl From<Gf2> for crate::ff::boolean::Boolean {
+            fn from(value: Gf2) -> Self {
+                bool::from(value).into()
+            }
+        }
+
+        impl std::ops::Not for Gf2 {
+            type Output = Self;
+
+            fn not(self) -> Self {
+                (!bool::from(self)).into()
+            }
+        }
     }
 );
diff --git a/ipa-core/src/ff/mod.rs b/ipa-core/src/ff/mod.rs
index 96aecca00..b831fd707 100644
--- a/ipa-core/src/ff/mod.rs
+++ b/ipa-core/src/ff/mod.rs
@@ -82,7 +82,7 @@ pub trait Serializable: Sized {
 
 pub trait ArrayAccess {
     type Output;
-    type Iter<'a>: Iterator<Item = Self::Output> + Send
+    type Iter<'a>: Iterator<Item = Self::Output> + ExactSizeIterator + Send
     where
         Self: 'a;
 
diff --git a/ipa-core/src/ff/prime_field.rs b/ipa-core/src/ff/prime_field.rs
index b7ac911b9..2f9694969 100644
--- a/ipa-core/src/ff/prime_field.rs
+++ b/ipa-core/src/ff/prime_field.rs
@@ -6,7 +6,7 @@ use super::Field;
 use crate::{
     ff::Serializable,
     protocol::prss::FromRandomU128,
-    secret_sharing::{Block, SharedValue},
+    secret_sharing::{Block, FieldVectorizable, SharedValue, StdArray, Vectorizable},
 };
 
 pub trait PrimeField: Field {
@@ -33,7 +33,17 @@ macro_rules! field_impl {
             const ZERO: Self = $field(0);
         }
 
+        impl Vectorizable<1> for $field {
+            type Array = StdArray<$field, 1>;
+        }
+
+        impl FieldVectorizable<1> for $field {
+            type ArrayAlias = StdArray<$field, 1>;
+        }
+
         impl Field for $field {
+            const NAME: &'static str = stringify!($field);
+
             const ONE: Self = $field(1);
 
             fn as_u128(&self) -> u128 {
@@ -317,6 +327,14 @@ mod fp31 {
 mod fp32bit {
     field_impl! { Fp32BitPrime, u32, 32, 4_294_967_291 }
 
+    impl Vectorizable<32> for Fp32BitPrime {
+        type Array = StdArray<Fp32BitPrime, 32>;
+    }
+
+    impl FieldVectorizable<32> for Fp32BitPrime {
+        type ArrayAlias = StdArray<Fp32BitPrime, 32>;
+    }
+
     #[cfg(all(test, unit_test))]
     mod specialized_tests {
         use super::*;
diff --git a/ipa-core/src/helpers/buffers/ordering_sender.rs b/ipa-core/src/helpers/buffers/ordering_sender.rs
index 943ee07f5..d7e383036 100644
--- a/ipa-core/src/helpers/buffers/ordering_sender.rs
+++ b/ipa-core/src/helpers/buffers/ordering_sender.rs
@@ -3,6 +3,7 @@ use std::{
     cmp::Ordering,
     collections::VecDeque,
     fmt::Debug,
+    marker::PhantomData,
     mem::drop,
     num::NonZeroUsize,
     pin::Pin,
@@ -330,8 +331,13 @@ impl OrderingSender {
     /// * the same index is provided more than once.
     ///
     /// [capacity]: OrderingSender#spare-capacity-configuration
-    pub fn send<M: Message>(&self, i: usize, m: M) -> Send<'_, M> {
-        Send { i, m, sender: self }
+    pub fn send<M: Message, B: Borrow<M>>(&self, i: usize, m: B) -> Send<'_, M, B> {
+        Send {
+            i,
+            m,
+            sender: self,
+            phantom_data: PhantomData,
+        }
     }
 
     /// Close the sender at index `i`.
@@ -433,13 +439,14 @@ impl OrderingSender {
 }
 
 /// A future for writing item `i` into an `OrderingSender`.
-pub struct Send<'s, M: Message> {
+pub struct Send<'a, M: Message, B: Borrow<M> + 'a> {
     i: usize,
-    m: M,
-    sender: &'s OrderingSender,
+    m: B,
+    sender: &'a OrderingSender,
+    phantom_data: PhantomData<M>,
 }
 
-impl<'s, M: Message> Future for Send<'s, M> {
+impl<'a, M: Message, B: Borrow<M> + 'a> Future for Send<'a, M, B> {
     type Output = ();
 
     fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
@@ -447,7 +454,7 @@ impl<'s, M: Message> Future for Send<'s, M> {
 
         let res = this.sender.next_op(this.i, cx, |b| {
             assert!(!b.closed, "writing on a closed stream");
-            b.write(&this.m, cx)
+            b.write(this.m.borrow(), cx)
         });
         // A successful write: wake the next in line.
         // But not while holding the lock on state.
diff --git a/ipa-core/src/helpers/gateway/send.rs b/ipa-core/src/helpers/gateway/send.rs
index bdc51a4da..00d8de096 100644
--- a/ipa-core/src/helpers/gateway/send.rs
+++ b/ipa-core/src/helpers/gateway/send.rs
@@ -1,4 +1,5 @@
 use std::{
+    borrow::Borrow,
     marker::PhantomData,
     num::NonZeroUsize,
     pin::Pin,
@@ -52,7 +53,11 @@ impl GatewaySender {
         }
     }
 
-    pub async fn send<M: Message>(&self, record_id: RecordId, msg: M) -> Result<(), Error> {
+    pub async fn send<M: Message, B: Borrow<M>>(
+        &self,
+        record_id: RecordId,
+        msg: B,
+    ) -> Result<(), Error> {
         debug_assert!(
             self.total_records.is_specified(),
             "total_records cannot be unspecified when sending"
@@ -109,7 +114,7 @@ impl<M: Message> SendingEnd<M> {
     ///
     /// [`set_total_records`]: crate::protocol::context::Context::set_total_records
     #[tracing::instrument(level = "trace", "send", skip_all, fields(i = %record_id, total = %self.inner.total_records, to = ?self.channel_id.role, gate = ?self.channel_id.gate.as_ref()))]
-    pub async fn send(&self, record_id: RecordId, msg: M) -> Result<(), Error> {
+    pub async fn send<B: Borrow<M>>(&self, record_id: RecordId, msg: B) -> Result<(), Error> {
         let r = self.inner.send(record_id, msg).await;
         metrics::increment_counter!(RECORDS_SENT,
             STEP => self.channel_id.gate.as_ref().to_string(),
diff --git a/ipa-core/src/helpers/gateway/stall_detection.rs b/ipa-core/src/helpers/gateway/stall_detection.rs
index c2e288572..9a1b28732 100644
--- a/ipa-core/src/helpers/gateway/stall_detection.rs
+++ b/ipa-core/src/helpers/gateway/stall_detection.rs
@@ -270,6 +270,7 @@ mod receive {
 
 mod send {
     use std::{
+        borrow::Borrow,
         collections::BTreeMap,
         fmt::{Debug, Formatter},
     };
@@ -288,7 +289,7 @@ mod send {
         delegate::delegate! {
             to { self.advance(); self.inner() } {
                 #[inline]
-                pub async fn send(&self, record_id: RecordId, msg: M) -> Result<(), Error>;
+                pub async fn send<B: Borrow<M>>(&self, record_id: RecordId, msg: B) -> Result<(), Error>;
             }
         }
     }
diff --git a/ipa-core/src/protocol/basics/mul/mod.rs b/ipa-core/src/protocol/basics/mul/mod.rs
index ed98e9d0b..b8924343a 100644
--- a/ipa-core/src/protocol/basics/mul/mod.rs
+++ b/ipa-core/src/protocol/basics/mul/mod.rs
@@ -7,9 +7,12 @@ use crate::{
         context::{Context, UpgradedMaliciousContext},
         RecordId,
     },
-    secret_sharing::replicated::{
-        malicious::{AdditiveShare as MaliciousReplicated, ExtendableField},
-        semi_honest::AdditiveShare as Replicated,
+    secret_sharing::{
+        replicated::{
+            malicious::{AdditiveShare as MaliciousReplicated, ExtendableField},
+            semi_honest::AdditiveShare as Replicated,
+        },
+        FieldSimd,
     },
 };
 
@@ -52,7 +55,11 @@ use {malicious::multiply as malicious_mul, semi_honest::multiply as semi_honest_
 
 /// Implement secure multiplication for semi-honest contexts with replicated secret sharing.
 #[async_trait]
-impl<C: Context, F: Field> SecureMul<C> for Replicated<F> {
+impl<C, F, const N: usize> SecureMul<C> for Replicated<F, N>
+where
+    C: Context,
+    F: Field + FieldSimd<N>,
+{
     async fn multiply_sparse<'fut>(
         &self,
         rhs: &Self,
diff --git a/ipa-core/src/protocol/basics/mul/semi_honest.rs b/ipa-core/src/protocol/basics/mul/semi_honest.rs
index 25de86946..67171ff25 100644
--- a/ipa-core/src/protocol/basics/mul/semi_honest.rs
+++ b/ipa-core/src/protocol/basics/mul/semi_honest.rs
@@ -8,8 +8,9 @@ use crate::{
         prss::SharedRandomness,
         RecordId,
     },
-    secret_sharing::replicated::{
-        semi_honest::AdditiveShare as Replicated, ReplicatedSecretSharing,
+    secret_sharing::{
+        replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, SharedValueArray,
+        Vectorizable,
     },
 };
 
@@ -26,16 +27,16 @@ use crate::{
 /// ## Errors
 /// Lots of things may go wrong here, from timeouts to bad output. They will be signalled
 /// back via the error response
-pub async fn multiply<C, F>(
+pub async fn multiply<C, F, const N: usize>(
     ctx: C,
     record_id: RecordId,
-    a: &Replicated<F>,
-    b: &Replicated<F>,
+    a: &Replicated<F, N>,
+    b: &Replicated<F, N>,
     zeros: MultiplyZeroPositions,
-) -> Result<Replicated<F>, Error>
+) -> Result<Replicated<F, N>, Error>
 where
     C: Context,
-    F: Field,
+    F: Field + FieldSimd<N>,
 {
     let role = ctx.role();
     let [need_to_recv, need_to_send, need_random_right] = zeros.work_for(role);
@@ -43,19 +44,26 @@ where
     zeros.1.check(role, "b", b);
 
     // Shared randomness used to mask the values that are sent.
-    let (s0, s1) = ctx.prss().generate(record_id);
+    let (s0, s1) = ctx
+        .prss()
+        .generate::<(<F as Vectorizable<N>>::Array, _), _>(record_id);
+
+    let mut rhs = a.right_arr().clone() * b.right_arr();
 
-    let mut rhs = a.right() * b.right();
     if need_to_send {
         // Compute the value (d_i) we want to send to the right helper (i+1).
-        let right_d = a.left() * b.right() + a.right() * b.left() - s0;
+        let right_d =
+            a.left_arr().clone() * b.right_arr() + a.right_arr().clone() * b.left_arr() - &s0;
 
-        ctx.send_channel(role.peer(Direction::Right))
-            .send(record_id, right_d)
+        ctx.send_channel::<<F as Vectorizable<N>>::Array>(role.peer(Direction::Right))
+            .send(record_id, &right_d)
             .await?;
         rhs += right_d;
     } else {
-        debug_assert_eq!(a.left() * b.right() + a.right() * b.left(), F::ZERO);
+        debug_assert_eq!(
+            a.left_arr().clone() * b.right_arr() + a.right_arr().clone() * b.left_arr(),
+            <<F as Vectorizable<N>>::Array as SharedValueArray<F>>::ZERO
+        );
     }
     // Add randomness to this value whether we sent or not, depending on whether the
     // peer to the right needed to send.  If they send, they subtract randomness,
@@ -65,9 +73,9 @@ where
     }
 
     // Sleep until helper on the left sends us their (d_i-1) value.
-    let mut lhs = a.left() * b.left();
+    let mut lhs = a.left_arr().clone() * b.left_arr();
     if need_to_recv {
-        let left_d = ctx
+        let left_d: <F as Vectorizable<N>>::Array = ctx
             .recv_channel(role.peer(Direction::Left))
             .receive(record_id)
             .await?;
@@ -78,21 +86,32 @@ where
         lhs += s0;
     }
 
-    Ok(Replicated::new(lhs, rhs))
+    Ok(Replicated::new_arr(lhs, rhs))
 }
 
 #[cfg(all(test, unit_test))]
 mod test {
-    use std::iter::{repeat, zip};
+    use std::{
+        array,
+        iter::{repeat, zip},
+        time::Instant,
+    };
 
     use rand::distributions::{Distribution, Standard};
 
+    use super::multiply;
     use crate::{
-        ff::{Field, Fp31},
-        protocol::{basics::SecureMul, context::Context, RecordId},
+        ff::{Field, Fp31, Fp32BitPrime},
+        helpers::TotalRecords,
+        protocol::{
+            basics::{SecureMul, ZeroPositions},
+            context::Context,
+            RecordId,
+        },
         rand::{thread_rng, Rng},
+        secret_sharing::replicated::semi_honest::AdditiveShare,
         seq_join::SeqJoin,
-        test_fixture::{Reconstruct, Runner, TestWorld},
+        test_fixture::{Reconstruct, ReconstructArr, Runner, TestWorld},
     };
 
     #[tokio::test]
@@ -182,4 +201,145 @@ mod test {
 
         result.reconstruct().as_u128()
     }
+
+    #[tokio::test]
+    pub async fn wide_mul() {
+        const COUNT: usize = 32;
+        let world = TestWorld::default();
+
+        let mut rng = thread_rng();
+        let a: [Fp32BitPrime; COUNT] = (0..COUNT)
+            .map(|_| rng.gen::<Fp32BitPrime>())
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        let b: [Fp32BitPrime; COUNT] = (0..COUNT)
+            .map(|_| rng.gen::<Fp32BitPrime>())
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        let expected: [Fp32BitPrime; COUNT] = zip(a.iter(), b.iter())
+            .map(|(&a, &b)| a * b)
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        let results = world
+            .semi_honest((a, b), |ctx, (a_shares, b_shares)| async move {
+                multiply(
+                    ctx.set_total_records(1),
+                    RecordId::from(0),
+                    &a_shares,
+                    &b_shares,
+                    ZeroPositions::NONE,
+                )
+                .await
+                .unwrap()
+            })
+            .await;
+        assert_eq!(expected, results.reconstruct_arr());
+    }
+
+    // The manymult test is a microbenchmark. The test generates a DxW matrix of field elements. The
+    // matrix is reduced to a single W-element row vector by taking the element-wise product of the
+    // D values in each column. The non-vectorized implementation (manymult_novec) simply does a
+    // parallel_join of W semi-honest multiplies. The vectorized implementation (manymult_vec)
+    // processes a row at a time. For manymult_vec, MANYMULT_WIDTH must match a supported
+    // vectorization width.
+    const MANYMULT_ITERS: usize = 512;
+    const MANYMULT_WIDTH: usize = 32;
+
+    #[tokio::test]
+    pub async fn manymult_novec() {
+        let world = TestWorld::default();
+        let mut rng = thread_rng();
+        let mut inputs = Vec::<Vec<Fp32BitPrime>>::new();
+        for _ in 0..MANYMULT_ITERS {
+            inputs.push(
+                (0..MANYMULT_WIDTH)
+                    .map(|_| Fp32BitPrime::try_from(u128::from(rng.gen_range(0u32..100))).unwrap())
+                    .collect::<Vec<_>>(),
+            );
+        }
+        let expected = inputs
+            .iter()
+            .fold(None, |acc: Option<Vec<Fp32BitPrime>>, b| match acc {
+                Some(a) => Some(a.iter().zip(b.iter()).map(|(&a, &b)| a * b).collect()),
+                None => Some(b.clone()),
+            })
+            .unwrap();
+
+        let begin = Instant::now();
+        let result = world
+            .semi_honest(
+                inputs.into_iter().map(IntoIterator::into_iter),
+                |ctx, share: Vec<Vec<AdditiveShare<Fp32BitPrime>>>| async move {
+                    let ctx = ctx.set_total_records(MANYMULT_ITERS * MANYMULT_WIDTH);
+                    let mut iter = share.iter();
+                    let mut val = iter.next().unwrap().clone();
+                    for i in 1..MANYMULT_ITERS {
+                        let cur = iter.next().unwrap();
+                        let mut res = Vec::with_capacity(MANYMULT_WIDTH);
+                        for j in 0..MANYMULT_WIDTH {
+                            res.push(val[j].multiply(
+                                &cur[j],
+                                ctx.clone(),
+                                RecordId::from(MANYMULT_WIDTH * (i - 1) + j),
+                            ));
+                        }
+                        val = ctx.parallel_join(res).await.unwrap();
+                    }
+                    val
+                },
+            )
+            .await;
+        tracing::debug!("Protocol execution time: {:?}", begin.elapsed());
+        assert_eq!(expected, result.reconstruct());
+    }
+
+    #[tokio::test]
+    pub async fn manymult_vec() {
+        let world = TestWorld::default();
+        let mut rng = thread_rng();
+        let mut inputs = Vec::<[Fp32BitPrime; MANYMULT_WIDTH]>::new();
+        for _ in 0..MANYMULT_ITERS {
+            inputs.push(array::from_fn(|_| rng.gen()));
+        }
+        let expected = inputs
+            .iter()
+            .fold(None, |acc: Option<Vec<Fp32BitPrime>>, b| match acc {
+                Some(a) => Some(a.iter().zip(b.iter()).map(|(&a, &b)| a * b).collect()),
+                None => Some(b.to_vec()),
+            })
+            .unwrap();
+
+        let begin = Instant::now();
+        let result = world
+            .semi_honest(
+                inputs.into_iter(),
+                |ctx, share: Vec<AdditiveShare<Fp32BitPrime, MANYMULT_WIDTH>>| async move {
+                    // The output of each row is input to the next row, so no parallelization
+                    // across rows is possible. Thus we set TotalRecords::Indeterminate, which
+                    // flushes after every record. If a row were larger than one record, we could
+                    // instead configure the active work in TestWorld to match the row size.
+                    let ctx = ctx.set_total_records(TotalRecords::Indeterminate);
+                    let mut iter = share.iter();
+                    let mut val = iter.next().unwrap().clone();
+                    for i in 1..MANYMULT_ITERS {
+                        val = multiply(
+                            ctx.clone(),
+                            RecordId::from(i - 1),
+                            &val,
+                            iter.next().unwrap(),
+                            ZeroPositions::NONE,
+                        )
+                        .await
+                        .unwrap();
+                    }
+                    val
+                },
+            )
+            .await;
+        tracing::debug!("Protocol execution time: {:?}", begin.elapsed());
+        assert_eq!(expected, result.reconstruct_arr());
+    }
 }
diff --git a/ipa-core/src/protocol/basics/mul/sparse.rs b/ipa-core/src/protocol/basics/mul/sparse.rs
index 9f1ad9943..878199b23 100644
--- a/ipa-core/src/protocol/basics/mul/sparse.rs
+++ b/ipa-core/src/protocol/basics/mul/sparse.rs
@@ -1,5 +1,8 @@
+#[cfg_attr(not(debug_assertions), allow(unused_variables))]
+use crate::secret_sharing::Vectorizable;
 use crate::{
-    ff::Field, helpers::Role, secret_sharing::replicated::semi_honest::AdditiveShare as Replicated,
+    helpers::Role,
+    secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, SharedValue},
 };
 
 /// A description of a replicated secret sharing, with zero values at known positions.
@@ -105,25 +108,28 @@ impl ZeroPositions {
     /// # Panics
     /// When the input value includes a non-zero value in a position marked as having a zero.
     #[cfg_attr(not(debug_assertions), allow(unused_variables))]
-    pub fn check<F: Field>(self, role: Role, which: &str, v: &Replicated<F>) {
+    pub fn check<V: SharedValue + Vectorizable<N>, const N: usize>(
+        self,
+        role: Role,
+        which: &str,
+        v: &Replicated<V, N>,
+    ) {
         #[cfg(debug_assertions)]
         {
-            use crate::{
-                helpers::Direction::Right, secret_sharing::replicated::ReplicatedSecretSharing,
-            };
+            use crate::{helpers::Direction::Right, secret_sharing::SharedValueArray};
 
             let flags = <[bool; 3]>::from(self);
             if flags[role as usize] {
                 assert_eq!(
-                    F::ZERO,
-                    v.left(),
+                    &<V as Vectorizable<N>>::Array::ZERO,
+                    v.left_arr(),
                     "expected a zero on the left for input {which}"
                 );
             }
             if flags[role.peer(Right) as usize] {
                 assert_eq!(
-                    F::ZERO,
-                    v.right(),
+                    &<V as Vectorizable<N>>::Array::ZERO,
+                    v.right_arr(),
                     "expected a zero on the right for input {which}"
                 );
             }
diff --git a/ipa-core/src/protocol/modulus_conversion/convert_shares.rs b/ipa-core/src/protocol/modulus_conversion/convert_shares.rs
index ce2a6a369..08a8dac75 100644
--- a/ipa-core/src/protocol/modulus_conversion/convert_shares.rs
+++ b/ipa-core/src/protocol/modulus_conversion/convert_shares.rs
@@ -70,7 +70,7 @@ impl<F: PrimeField> BitConversionTriple<Replicated<F>> {
     ///
     /// # Panics
     /// If any bits in the bitwise shared input cannot be converted into the given field `F`
-    /// without truncation or if the bit index is out of range for `B`.
+    /// without truncation.
     #[must_use]
     pub fn new(helper_role: Role, left: bool, right: bool) -> Self {
         let left = F::try_from(u128::from(left)).unwrap();
diff --git a/ipa-core/src/secret_sharing/array.rs b/ipa-core/src/secret_sharing/array.rs
new file mode 100644
index 000000000..f05cc30dc
--- /dev/null
+++ b/ipa-core/src/secret_sharing/array.rs
@@ -0,0 +1,315 @@
+use std::{
+    array,
+    borrow::Borrow,
+    fmt::Debug,
+    ops::{Add, AddAssign, Mul, Neg, Not, Sub, SubAssign},
+};
+
+use generic_array::{ArrayLength, GenericArray};
+use typenum::{U1, U32};
+
+use crate::{
+    ff::{Field, Fp32BitPrime, Serializable},
+    helpers::Message,
+    protocol::prss::{FromRandom, FromRandomU128},
+    secret_sharing::{FieldArray, SharedValue, SharedValueArray},
+};
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StdArray<V: SharedValue, const N: usize>([V; N]);
+
+impl<V, T, const N: usize> PartialEq<T> for StdArray<V, N>
+where
+    V: SharedValue,
+    T: Borrow<[V]>,
+{
+    fn eq(&self, other: &T) -> bool {
+        self.0.as_slice() == other.borrow()
+    }
+}
+
+impl<V: SharedValue, const N: usize> PartialEq<StdArray<V, N>> for Vec<V> {
+    fn eq(&self, other: &StdArray<V, N>) -> bool {
+        other.eq(self)
+    }
+}
+
+impl<V: SharedValue, const N: usize> PartialEq<StdArray<V, N>> for [V; N] {
+    fn eq(&self, other: &StdArray<V, N>) -> bool {
+        other.eq(self)
+    }
+}
+
+impl<V: SharedValue, const N: usize> SharedValueArray<V> for StdArray<V, N>
+where
+    Self: Serializable,
+{
+    const ZERO: Self = Self([V::ZERO; N]);
+
+    fn from_fn<F: FnMut(usize) -> V>(f: F) -> Self {
+        Self(array::from_fn(f))
+    }
+
+    fn get(&self, index: usize) -> V {
+        self.0[index]
+    }
+
+    fn get_mut(&mut self, index: usize) -> &mut V {
+        &mut self.0[index]
+    }
+
+    fn set(&mut self, index: usize, value: V) {
+        self.0[index] = value;
+    }
+}
+
+impl<F: Field, const N: usize> FieldArray<F> for StdArray<F, N> where Self: FromRandom + Serializable
+{}
+
+impl<V: SharedValue, const N: usize> TryFrom<Vec<V>> for StdArray<V, N> {
+    type Error = ();
+    fn try_from(value: Vec<V>) -> Result<Self, Self::Error> {
+        value.try_into().map(Self).map_err(|_| ())
+    }
+}
+
+// Panics if the iterator yields fewer than N items; items after the first N are ignored.
+impl<V: SharedValue, const N: usize> FromIterator<V> for StdArray<V, N>
+where
+    Self: Serializable,
+{
+    fn from_iter<T: IntoIterator<Item = V>>(iter: T) -> Self {
+        let mut res = Self::ZERO;
+        let mut iter = iter.into_iter();
+
+        for slot in &mut res.0 {
+            *slot = iter.next().expect("iterator yielded fewer than N items");
+        }
+
+        res
+    }
+}
+
+impl<V: SharedValue, const N: usize> IntoIterator for StdArray<V, N> {
+    type Item = V;
+    type IntoIter = std::array::IntoIter<V, N>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.into_iter()
+    }
+}
+
+impl<'a, 'b, V: SharedValue, const N: usize> Add<&'b StdArray<V, N>> for &'a StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn add(self, rhs: &'b StdArray<V, N>) -> Self::Output {
+        StdArray(array::from_fn(|i| self.0[i] + rhs.0[i]))
+    }
+}
+
+impl<V: SharedValue, const N: usize> Add<Self> for StdArray<V, N> {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Add::add(&self, &rhs)
+    }
+}
+
+// add(owned, ref) should be preferred over this.
+impl<V: SharedValue, const N: usize> Add<StdArray<V, N>> for &StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn add(self, rhs: StdArray<V, N>) -> Self::Output {
+        Add::add(self, &rhs)
+    }
+}
+
+impl<V: SharedValue, const N: usize> Add<&StdArray<V, N>> for StdArray<V, N> {
+    type Output = Self;
+
+    fn add(self, rhs: &Self) -> Self::Output {
+        Add::add(&self, rhs)
+    }
+}
+
+impl<V: SharedValue, const N: usize> AddAssign<&Self> for StdArray<V, N> {
+    fn add_assign(&mut self, rhs: &Self) {
+        for (a, b) in self.0.iter_mut().zip(rhs.0.iter()) {
+            *a += *b;
+        }
+    }
+}
+
+impl<V: SharedValue, const N: usize> AddAssign<Self> for StdArray<V, N> {
+    fn add_assign(&mut self, rhs: Self) {
+        AddAssign::add_assign(self, &rhs);
+    }
+}
+
+impl<V: SharedValue, const N: usize> Neg for &StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn neg(self) -> Self::Output {
+        StdArray(array::from_fn(|i| -self.0[i]))
+    }
+}
+
+impl<V: SharedValue, const N: usize> Neg for StdArray<V, N> {
+    type Output = Self;
+
+    fn neg(self) -> Self::Output {
+        Neg::neg(&self)
+    }
+}
+
+impl<V: SharedValue, const N: usize> Sub<Self> for &StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        StdArray(array::from_fn(|i| self.0[i] - rhs.0[i]))
+    }
+}
+
+impl<V: SharedValue, const N: usize> Sub<Self> for StdArray<V, N> {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Sub::sub(&self, &rhs)
+    }
+}
+
+impl<V: SharedValue, const N: usize> Sub<&Self> for StdArray<V, N> {
+    type Output = Self;
+
+    fn sub(self, rhs: &Self) -> Self::Output {
+        Sub::sub(&self, rhs)
+    }
+}
+
+impl<V: SharedValue, const N: usize> Sub<StdArray<V, N>> for &StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn sub(self, rhs: StdArray<V, N>) -> Self::Output {
+        Sub::sub(self, &rhs)
+    }
+}
+
+impl<V: SharedValue, const N: usize> SubAssign<&Self> for StdArray<V, N> {
+    fn sub_assign(&mut self, rhs: &Self) {
+        for (a, b) in self.0.iter_mut().zip(rhs.0.iter()) {
+            *a -= *b;
+        }
+    }
+}
+
+impl<V: SharedValue, const N: usize> SubAssign<Self> for StdArray<V, N> {
+    fn sub_assign(&mut self, rhs: Self) {
+        SubAssign::sub_assign(self, &rhs);
+    }
+}
+
+impl<'a, 'b, F: Field, const N: usize> Mul<&'b F> for &'a StdArray<F, N> {
+    type Output = StdArray<F, N>;
+
+    fn mul(self, rhs: &'b F) -> Self::Output {
+        StdArray(array::from_fn(|i| self.0[i] * *rhs))
+    }
+}
+
+impl<F: Field, const N: usize> Mul<F> for StdArray<F, N> {
+    type Output = Self;
+
+    fn mul(self, rhs: F) -> Self::Output {
+        Mul::mul(&self, &rhs)
+    }
+}
+
+impl<F: Field, const N: usize> Mul<&F> for StdArray<F, N> {
+    type Output = Self;
+
+    fn mul(self, rhs: &F) -> Self::Output {
+        Mul::mul(&self, rhs)
+    }
+}
+
+impl<F: Field, const N: usize> Mul<F> for &StdArray<F, N> {
+    type Output = StdArray<F, N>;
+
+    fn mul(self, rhs: F) -> Self::Output {
+        Mul::mul(self, &rhs)
+    }
+}
+
+impl<'a, F: Field, const N: usize> Mul<&'a StdArray<F, N>> for StdArray<F, N> {
+    type Output = StdArray<F, N>;
+
+    fn mul(self, rhs: &'a StdArray<F, N>) -> Self::Output {
+        StdArray(array::from_fn(|i| self.0[i] * rhs.0[i]))
+    }
+}
+
+impl<V: SharedValue + Not<Output = V>, const N: usize> Not for StdArray<V, N> {
+    type Output = StdArray<V, N>;
+
+    fn not(self) -> Self::Output {
+        StdArray(array::from_fn(|i| !self.0[i]))
+    }
+}
+
+impl<F: SharedValue + FromRandom<SourceLength = U1>> FromRandom for StdArray<F, 1> {
+    type SourceLength = U1;
+    fn from_random(src: GenericArray<u128, U1>) -> Self {
+        Self([F::from_random(src)])
+    }
+}
+
+impl FromRandom for StdArray<Fp32BitPrime, 32> {
+    type SourceLength = U32;
+
+    fn from_random(src: GenericArray<u128, U32>) -> Self {
+        Self(array::from_fn(|i| Fp32BitPrime::from_random_u128(src[i])))
+    }
+}
+
+impl<V: SharedValue> Serializable for StdArray<V, 1> {
+    type Size = <V as Serializable>::Size;
+    type DeserializationError = <V as Serializable>::DeserializationError;
+
+    fn serialize(&self, buf: &mut GenericArray<u8, Self::Size>) {
+        self.0[0].serialize(buf);
+    }
+
+    fn deserialize(buf: &GenericArray<u8, Self::Size>) -> Result<Self, Self::DeserializationError> {
+        Ok(StdArray([V::deserialize(buf)?]))
+    }
+}
+
+impl<V: SharedValue> Serializable for StdArray<V, 32>
+where
+    V: SharedValue,
+    <V as Serializable>::Size: Mul<U32>,
+    <<V as Serializable>::Size as Mul<U32>>::Output: ArrayLength,
+{
+    type Size = <<V as Serializable>::Size as Mul<U32>>::Output;
+    type DeserializationError = <V as Serializable>::DeserializationError;
+
+    fn serialize(&self, buf: &mut GenericArray<u8, Self::Size>) {
+        let sz = <<V as Serializable>::Size as typenum::Unsigned>::USIZE; // bytes per element
+        for i in 0..32 {
+            self.0[i].serialize(
+                GenericArray::try_from_mut_slice(&mut buf[sz * i..sz * (i + 1)]).unwrap(),
+            );
+        }
+    }
+
+    fn deserialize(buf: &GenericArray<u8, Self::Size>) -> Result<Self, Self::DeserializationError> {
+        let sz = <<V as Serializable>::Size as typenum::Unsigned>::USIZE; // bytes per element
+        let mut res = [V::ZERO; 32];
+        for i in 0..32 {
+            res[i] = V::deserialize(GenericArray::from_slice(&buf[sz * i..sz * (i + 1)]))?;
+        }
+        Ok(StdArray(res))
+    }
+}
+
+impl<V: SharedValue, const N: usize> Message for StdArray<V, N> where Self: Serializable {}
diff --git a/ipa-core/src/secret_sharing/mod.rs b/ipa-core/src/secret_sharing/mod.rs
index 42a62ca55..3b4923854 100644
--- a/ipa-core/src/secret_sharing/mod.rs
+++ b/ipa-core/src/secret_sharing/mod.rs
@@ -1,14 +1,62 @@
+//! # Vectorization
+//!
+//! Vectorization refers to adapting an implementation that previously operated on one value at a
+//! time, to instead operate on `N` values at a time. Vectorization improves performance in two ways:
+//!
+//!  1. Vectorized code can make use of special CPU instructions (Intel AVX, ARM NEON) that operate
+//!     on multiple values at a time. This reduces the CPU time required to perform computations.
+//!     We also use vectorization to refer to "bit packing" of boolean values, i.e., packing
+//!     64 boolean values into a single u64 rather than using a byte (or even a word) for each
+//!     value.
+//!  2. Aside from the core arithmetic operations that are involved in our MPC, a substantial
+//!     amount of other code is needed to send values between helpers, schedule futures for
+//!     execution, etc. Vectorization can result in a greater amount of arithmetic work being
+//!     performed for a given amount of overhead work, thus increasing the efficiency of the
+//!     implementation.
+//!
+//! ## Vectorization traits
+//!
+//! There are two sets of traits related to vectorization.
+//!
+//! If you are writing protocols, the trait of interest is `FieldSimd<N>`, which can be specified in
+//! a trait bound, something like `F: Field + FieldSimd<N>`.
+//!
+//! The other traits are `Vectorizable` (for `SharedValue`s) and `FieldVectorizable`. These traits
+//! are needed to work around a limitation in the rust type system. See the `FieldVectorizable`
+//! documentation for details.
+//!
+//! We require that each supported vectorization configuration (i.e. combination of data type and
+//! width) be explicitly identified, by implementing the `Vectorizable` and/or `FieldVectorizable`
+//! traits for the base data type (e.g. `Fp32BitPrime`). This is for two reasons:
+//!  1. Rust doesn't yet support evaluating expressions involving const parameters at compile time,
+//!     which makes it difficult or impossible to write generic serialization routines for
+//!     arbitrary widths.
+//!  2. As a measure of protection against inadvertently using a configuration that will not be
+//!     efficient (i.e. an excessive vector width).
+//!
+//! ## Adding a new supported vectorization
+//!
+//! To add a new supported vectorization:
+//!
+//!  1. Add `FieldSimd` impl (in `secret_sharing/mod.rs`)
+//!  2. Add `FromRandom` impl (in `array.rs` or `boolean_array.rs`)
+//!  3. Add `Serializable` impl (in `array.rs` or `boolean_array.rs`)
+//!  4. Add `Vectorizable` and `FieldVectorizable` impls (with the primitive type def in e.g. `galois_field.rs`)
+
 pub mod replicated;
 
+mod array;
 mod decomposed;
 mod into_shares;
 mod scheme;
 
 use std::{
     fmt::Debug,
-    ops::{Mul, MulAssign, Neg},
+    iter::once,
+    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
 };
 
+pub use array::StdArray;
 pub use decomposed::BitDecomposed;
 use generic_array::ArrayLength;
 pub use into_shares::IntoShares;
@@ -21,7 +69,11 @@ use rand::{
 use replicated::{semi_honest::AdditiveShare, ReplicatedSecretSharing};
 pub use scheme::{Bitwise, Linear, LinearRefOps, SecretSharing};
 
-use crate::ff::{AddSub, AddSubAssign, Serializable};
+use crate::{
+    ff::{AddSub, AddSubAssign, Field, Fp32BitPrime, Serializable},
+    helpers::Message,
+    protocol::prss::FromRandom,
+};
 
 /// Operations supported for weak shared values.
 pub trait Additive<Rhs = Self, Output = Self>:
@@ -57,13 +109,140 @@ pub trait Block: Sized + Copy + Debug {
 /// (capable of supporting addition and multiplication) is desired, the `Field` trait extends
 /// `SharedValue` to require multiplication.
 pub trait SharedValue:
-    Clone + Copy + Eq + Debug + Send + Sync + Sized + Additive + Serializable + 'static
+    Clone
+    + Copy
+    + Eq
+    + Debug
+    + Send
+    + Sync
+    + Sized
+    + Additive
+    + Serializable
+    + Vectorizable<1>
+    + 'static
 {
     type Storage: Block;
 
     const BITS: u32;
 
     const ZERO: Self;
+
+    // Note the trait bound of `Vectorizable<1>` here, i.e., these
+    // helpers only apply to arrays of a single element.
+    fn into_array<A>(self) -> A
+    where
+        Self: Vectorizable<1, Array = A>,
+        A: SharedValueArray<Self>,
+    {
+        once(self).collect::<A>()
+    }
+
+    fn from_array<A>(array: &A) -> Self
+    where
+        Self: Vectorizable<1, Array = A>,
+        A: SharedValueArray<Self>,
+    {
+        array.get(0)
+    }
+
+    fn from_array_mut<A>(array: &mut A) -> &mut Self
+    where
+        Self: Vectorizable<1, Array = A>,
+        A: SharedValueArray<Self>,
+    {
+        array.get_mut(0)
+    }
+}
+
+// Note that we can either make `trait Vectorizable<N>: SharedValue`, or we can make `trait
+// SharedValue: Vectorizable<1>`, but doing both creates a cycle. (Similarly for
+// `FieldVectorizable` / `Field`.)
+//
+// Although it is somewhat unnatural, we choose to do the latter, because it allows existing
+// high-level protocols unaware of vectorization to call vectorized versions of core protocols (with
+// width of 1) without updating all of the trait bounds. This does mean that the trait definitions
+// do not prevent implementing `Vectorizable` for something that is not a `SharedValue`, but please
+// don't do that.
+
+/// Trait for `SharedValue`s supporting operations on `N`-wide vectors.
+pub trait Vectorizable<const N: usize>: Sized {
+    type Array: SharedValueArray<Self>;
+}
+
+/// Trait for `Field`s supporting operations on `N`-wide vectors.
+///
+/// We would like `F` to be `FieldVectorizable` if it satisfies all of the following:
+///  1. `F: Field`.
+///  2. `<F as Vectorizable<N>>::Array: FieldArray<Self>`. Rust does not support expressing a
+///     constraint on a super-trait's associated type directly. Instead, this effect is achieved
+///     by constraining the `ArrayAlias` associated type and then constraining that
+///     `Vectorizable::Array == FieldVectorizable::ArrayAlias` where necessary (e.g. in the
+///     definition and blanket impl of the `FieldSimd` trait). We call it `ArrayAlias` instead of
+///     `Array` so that references to the `Array` associated type do not require qualification
+///     with a trait name.
+///  3. `F: Vectorizable<N>`. This is implied by the previous two, because `FieldArray`
+///     is a sub-trait of `SharedValueArray`.
+pub trait FieldVectorizable<const N: usize>: SharedValue + Sized {
+    type ArrayAlias: FieldArray<Self>;
+}
+
+// We could define a `SharedValueSimd` trait that is the analog of this for `SharedValue`s, but
+// there are not currently any protocols that need it.
+pub trait FieldSimd<const N: usize>:
+    Field + Vectorizable<N, Array = <Self as FieldVectorizable<N>>::ArrayAlias> + FieldVectorizable<N>
+{
+}
+
+// Portions of the implementation treat non-vectorized operations as a vector with `N = 1`. This
+// blanket impl (and the fact that `F: Field` is the only trait bound) is important in allowing code
+// that writes `F: Field` to continue working without modification.
+impl<F: Field> FieldSimd<1> for F {}
+
+// Supported vectorizations
+
+impl FieldSimd<32> for Fp32BitPrime {}
+
+pub trait SharedValueArray<V>:
+    Clone
+    + Eq
+    + Debug
+    + Send
+    + Sync
+    + Sized
+    + TryFrom<Vec<V>, Error = ()>
+    + FromIterator<V>
+    + IntoIterator<Item = V>
+    + Add<Self, Output = Self>
+    + for<'a> Add<&'a Self, Output = Self>
+    + AddAssign<Self>
+    + for<'a> AddAssign<&'a Self>
+    + Neg<Output = Self>
+    + Sub<Self, Output = Self>
+    + for<'a> Sub<&'a Self, Output = Self>
+    + SubAssign<Self>
+    + for<'a> SubAssign<&'a Self>
+    + Message
+{
+    const ZERO: Self;
+
+    fn from_fn<F: FnMut(usize) -> V>(f: F) -> Self;
+
+    fn get(&self, index: usize) -> V;
+
+    fn get_mut(&mut self, index: usize) -> &mut V;
+
+    fn set(&mut self, index: usize, value: V);
+}
+
+// Some `SharedValue` types (and thus their arrays) implement `FromRandom`, but `RP25519` does not.
+// We overload this distinction on `FieldArray` instead of creating a separate `ArrayFromRandom` trait,
+// to avoid making the `Vectorizable` / `FieldVectorizable` situation that much more complicated.
+pub trait FieldArray<F: SharedValue>:
+    SharedValueArray<F>
+    + FromRandom
+    + for<'a> Mul<&'a F, Output = Self>
+    + for<'a> Mul<&'a Self, Output = Self>
+{
 }
 
 #[cfg(any(test, feature = "test-fixture", feature = "cli"))]
@@ -85,6 +264,29 @@ where
     }
 }
 
+#[cfg(any(test, feature = "test-fixture", feature = "cli"))]
+impl<V, const N: usize> IntoShares<AdditiveShare<V, N>> for [V; N]
+where
+    V: SharedValue + Vectorizable<N>,
+    Standard: Distribution<V>,
+{
+    fn share_with<R: Rng>(self, rng: &mut R) -> [AdditiveShare<V, N>; 3] {
+        // For arrays large enough that the compiler doesn't just unroll everything, it might be
+        // more efficient to avoid the intermediate vector by implementing this as a specialized
+        // hybrid of the impls for `F as IntoShares<Replicated<F>>` and `<V: IntoIterator> as
+        // IntoShares<Vec<T>>`. Not bothering since this is test-support functionality.
+        let [v1, v2, v3] = self.into_iter().share_with(rng);
+        let (v1l, v1r): (Vec<V>, Vec<V>) = v1.iter().map(AdditiveShare::as_tuple).unzip();
+        let (v2l, v2r): (Vec<V>, Vec<V>) = v2.iter().map(AdditiveShare::as_tuple).unzip();
+        let (v3l, v3r): (Vec<V>, Vec<V>) = v3.iter().map(AdditiveShare::as_tuple).unzip();
+        [
+            AdditiveShare::new_arr(v1l.try_into().unwrap(), v1r.try_into().unwrap()),
+            AdditiveShare::new_arr(v2l.try_into().unwrap(), v2r.try_into().unwrap()),
+            AdditiveShare::new_arr(v3l.try_into().unwrap(), v3r.try_into().unwrap()),
+        ]
+    }
+}
+
 #[cfg(all(test, unit_test))]
 mod tests {
     use crate::{
diff --git a/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs b/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs
index a1011fce6..319a438c7 100644
--- a/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs
+++ b/ipa-core/src/secret_sharing/replicated/semi_honest/additive_share.rs
@@ -1,6 +1,6 @@
 use std::{
     fmt::{Debug, Formatter},
-    ops::{Add, AddAssign, Mul, Neg, Sub, SubAssign},
+    ops::{Add, AddAssign, Mul, Neg, Range, Sub, SubAssign},
 };
 
 use generic_array::{ArrayLength, GenericArray};
@@ -9,23 +9,37 @@ use typenum::Unsigned;
 use crate::{
     ff::{ArrayAccess, Expand, Field, Serializable},
     secret_sharing::{
-        replicated::ReplicatedSecretSharing, Linear as LinearSecretSharing, SecretSharing,
-        SharedValue,
+        replicated::ReplicatedSecretSharing, FieldSimd, Linear as LinearSecretSharing,
+        SecretSharing, SharedValue, SharedValueArray, Vectorizable,
     },
 };
 
+/// Additive secret sharing.
+///
+/// `AdditiveShare` holds two out of three shares of an additive secret sharing, either of a single
+/// value with type `V`, or a vector of such values.
 #[derive(Clone, PartialEq, Eq)]
-pub struct AdditiveShare<V: SharedValue>(V, V);
+pub struct AdditiveShare<V: SharedValue + Vectorizable<N>, const N: usize = 1>(
+    <V as Vectorizable<N>>::Array,
+    <V as Vectorizable<N>>::Array,
+);
 
 #[derive(Clone, PartialEq, Eq)]
-pub struct ASIterator<T: Iterator>(pub T, pub T);
+pub struct ASIterator<'a, S: SharedValue + ArrayAccess> {
+    range: Range<usize>,
+    share: &'a AdditiveShare<S>,
+}
 
-impl<V: SharedValue> SecretSharing<V> for AdditiveShare<V> {
-    const ZERO: Self = AdditiveShare::ZERO;
+impl<V: SharedValue + Vectorizable<N>, const N: usize> SecretSharing<V> for AdditiveShare<V, N> {
+    const ZERO: Self = Self(
+        <V as Vectorizable<N>>::Array::ZERO,
+        <V as Vectorizable<N>>::Array::ZERO,
+    );
 }
-impl<F: Field> LinearSecretSharing<F> for AdditiveShare<F> {}
 
-impl<V: SharedValue + Debug> Debug for AdditiveShare<V> {
+impl<F, const N: usize> LinearSecretSharing<F> for AdditiveShare<F, N> where F: Field + FieldSimd<N> {}
+
+impl<V: SharedValue + Vectorizable<N> + Debug, const N: usize> Debug for AdditiveShare<V, N> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(f, "({:?}, {:?})", self.0, self.1)
     }
@@ -37,26 +51,48 @@ impl<V: SharedValue> Default for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> AdditiveShare<V> {
-    /// Replicated secret share where both left and right values are `F::ZERO`
-    pub const ZERO: Self = Self(V::ZERO, V::ZERO);
+impl<V: SharedValue + Vectorizable<N>, const N: usize> AdditiveShare<V, N> {
+    /// Replicated secret share where both left and right values are `V::ZERO`
+    pub const ZERO: Self = Self(
+        <V as Vectorizable<N>>::Array::ZERO,
+        <V as Vectorizable<N>>::Array::ZERO,
+    );
+}
 
+impl<V: SharedValue> AdditiveShare<V> {
     pub fn as_tuple(&self) -> (V, V) {
-        (self.0, self.1)
+        (V::from_array(&self.0), V::from_array(&self.1))
     }
 }
 
-impl<V: SharedValue> ReplicatedSecretSharing<V> for AdditiveShare<V> {
+impl<V> ReplicatedSecretSharing<V> for AdditiveShare<V>
+where
+    V: SharedValue + Vectorizable<1>,
+{
     fn new(a: V, b: V) -> Self {
-        Self(a, b)
+        Self(a.into_array(), b.into_array())
     }
 
     fn left(&self) -> V {
-        self.0
+        V::from_array(&self.0)
     }
 
     fn right(&self) -> V {
-        self.1
+        V::from_array(&self.1)
+    }
+}
+
+impl<V: SharedValue + Vectorizable<N>, const N: usize> AdditiveShare<V, N> {
+    pub fn new_arr(a: <V as Vectorizable<N>>::Array, b: <V as Vectorizable<N>>::Array) -> Self {
+        Self(a, b)
+    }
+
+    pub fn left_arr(&self) -> &<V as Vectorizable<N>>::Array {
+        &self.0
+    }
+
+    pub fn right_arr(&self) -> &<V as Vectorizable<N>>::Array {
+        &self.1
     }
 }
 
@@ -86,15 +122,20 @@ where
     }
 }
 
-impl<'a, 'b, V: SharedValue> Add<&'b AdditiveShare<V>> for &'a AdditiveShare<V> {
-    type Output = AdditiveShare<V>;
+impl<'a, 'b, V: SharedValue + Vectorizable<N>, const N: usize> Add<&'b AdditiveShare<V, N>>
+    for &'a AdditiveShare<V, N>
+{
+    type Output = AdditiveShare<V, N>;
 
-    fn add(self, rhs: &'b AdditiveShare<V>) -> Self::Output {
-        AdditiveShare(self.0 + rhs.0, self.1 + rhs.1)
+    fn add(self, rhs: &'b AdditiveShare<V, N>) -> Self::Output {
+        AdditiveShare(
+            Add::add(self.0.clone(), &rhs.0),
+            Add::add(self.1.clone(), &rhs.1),
+        )
     }
 }
 
-impl<V: SharedValue> Add<Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Add<Self> for AdditiveShare<V, N> {
     type Output = Self;
 
     fn add(self, rhs: Self) -> Self::Output {
@@ -102,15 +143,19 @@ impl<V: SharedValue> Add<Self> for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> Add<AdditiveShare<V>> for &AdditiveShare<V> {
-    type Output = AdditiveShare<V>;
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Add<AdditiveShare<V, N>>
+    for &AdditiveShare<V, N>
+{
+    type Output = AdditiveShare<V, N>;
 
-    fn add(self, rhs: AdditiveShare<V>) -> Self::Output {
+    fn add(self, rhs: AdditiveShare<V, N>) -> Self::Output {
         Add::add(self, &rhs)
     }
 }
 
-impl<V: SharedValue> Add<&AdditiveShare<V>> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Add<&AdditiveShare<V, N>>
+    for AdditiveShare<V, N>
+{
     type Output = Self;
 
     fn add(self, rhs: &Self) -> Self::Output {
@@ -118,28 +163,28 @@ impl<V: SharedValue> Add<&AdditiveShare<V>> for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> AddAssign<&Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> AddAssign<&Self> for AdditiveShare<V, N> {
     fn add_assign(&mut self, rhs: &Self) {
-        self.0 += rhs.0;
-        self.1 += rhs.1;
+        self.0 += &rhs.0;
+        self.1 += &rhs.1;
     }
 }
 
-impl<V: SharedValue> AddAssign<Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> AddAssign<Self> for AdditiveShare<V, N> {
     fn add_assign(&mut self, rhs: Self) {
         AddAssign::add_assign(self, &rhs);
     }
 }
 
-impl<V: SharedValue> Neg for &AdditiveShare<V> {
-    type Output = AdditiveShare<V>;
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Neg for &AdditiveShare<V, N> {
+    type Output = AdditiveShare<V, N>;
 
     fn neg(self) -> Self::Output {
-        AdditiveShare(-self.0, -self.1)
+        AdditiveShare(-self.0.clone(), -self.1.clone())
     }
 }
 
-impl<V: SharedValue> Neg for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Neg for AdditiveShare<V, N> {
     type Output = Self;
 
     fn neg(self) -> Self::Output {
@@ -147,15 +192,18 @@ impl<V: SharedValue> Neg for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> Sub<Self> for &AdditiveShare<V> {
-    type Output = AdditiveShare<V>;
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Sub<Self> for &AdditiveShare<V, N> {
+    type Output = AdditiveShare<V, N>;
 
     fn sub(self, rhs: Self) -> Self::Output {
-        AdditiveShare(self.0 - rhs.0, self.1 - rhs.1)
+        AdditiveShare(
+            Sub::sub(self.0.clone(), &rhs.0),
+            Sub::sub(self.1.clone(), &rhs.1),
+        )
     }
 }
 
-impl<V: SharedValue> Sub<Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Sub<Self> for AdditiveShare<V, N> {
     type Output = Self;
 
     fn sub(self, rhs: Self) -> Self::Output {
@@ -163,7 +211,7 @@ impl<V: SharedValue> Sub<Self> for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> Sub<&Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Sub<&Self> for AdditiveShare<V, N> {
     type Output = Self;
 
     fn sub(self, rhs: &Self) -> Self::Output {
@@ -171,53 +219,64 @@ impl<V: SharedValue> Sub<&Self> for AdditiveShare<V> {
     }
 }
 
-impl<V: SharedValue> Sub<AdditiveShare<V>> for &AdditiveShare<V> {
-    type Output = AdditiveShare<V>;
+impl<V: SharedValue + Vectorizable<N>, const N: usize> Sub<AdditiveShare<V, N>>
+    for &AdditiveShare<V, N>
+{
+    type Output = AdditiveShare<V, N>;
 
-    fn sub(self, rhs: AdditiveShare<V>) -> Self::Output {
+    fn sub(self, rhs: AdditiveShare<V, N>) -> Self::Output {
         Sub::sub(self, &rhs)
     }
 }
 
-impl<V: SharedValue> SubAssign<&Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> SubAssign<&Self> for AdditiveShare<V, N> {
     fn sub_assign(&mut self, rhs: &Self) {
-        self.0 -= rhs.0;
-        self.1 -= rhs.1;
+        self.0 -= &rhs.0;
+        self.1 -= &rhs.1;
     }
 }
 
-impl<V: SharedValue> SubAssign<Self> for AdditiveShare<V> {
+impl<V: SharedValue + Vectorizable<N>, const N: usize> SubAssign<Self> for AdditiveShare<V, N> {
     fn sub_assign(&mut self, rhs: Self) {
         SubAssign::sub_assign(self, &rhs);
     }
 }
 
-impl<'a, 'b, F: Field> Mul<&'b F> for &'a AdditiveShare<F> {
-    type Output = AdditiveShare<F>;
+impl<'a, 'b, F, const N: usize> Mul<&'b F> for &'a AdditiveShare<F, N>
+where
+    F: Field + FieldSimd<N>,
+{
+    type Output = AdditiveShare<F, N>;
 
     fn mul(self, rhs: &'b F) -> Self::Output {
-        AdditiveShare(self.0 * *rhs, self.1 * *rhs)
+        AdditiveShare(self.0.clone() * rhs, self.1.clone() * rhs)
     }
 }
 
-impl<F: Field> Mul<F> for AdditiveShare<F> {
+impl<F: Field, const N: usize> Mul<F> for AdditiveShare<F, N>
+where
+    F: Field + FieldSimd<N>,
+{
     type Output = Self;
 
     fn mul(self, rhs: F) -> Self::Output {
-        Mul::mul(&self, &rhs)
+        Mul::mul(&self, rhs)
     }
 }
 
-impl<F: Field> Mul<&F> for AdditiveShare<F> {
+impl<'a, F: Field + FieldSimd<N>, const N: usize> Mul<&'a F> for AdditiveShare<F, N> {
     type Output = Self;
 
     fn mul(self, rhs: &F) -> Self::Output {
-        Mul::mul(&self, rhs)
+        Mul::mul(&self, *rhs)
     }
 }
 
-impl<F: Field> Mul<F> for &AdditiveShare<F> {
-    type Output = AdditiveShare<F>;
+impl<F, const N: usize> Mul<F> for &AdditiveShare<F, N>
+where
+    F: Field + FieldSimd<N>,
+{
+    type Output = AdditiveShare<F, N>;
 
     fn mul(self, rhs: F) -> Self::Output {
         Mul::mul(self, &rhs)
@@ -230,11 +289,15 @@ impl<V: SharedValue> From<(V, V)> for AdditiveShare<V> {
     }
 }
 
-impl<V: std::ops::Not<Output = V> + SharedValue> std::ops::Not for AdditiveShare<V> {
+impl<V, const N: usize> std::ops::Not for AdditiveShare<V, N>
+where
+    V: SharedValue + Vectorizable<N>,
+    <V as Vectorizable<N>>::Array: std::ops::Not<Output = <V as Vectorizable<N>>::Array>,
+{
     type Output = Self;
 
     fn not(self) -> Self::Output {
-        AdditiveShare(!(self.0), !(self.1))
+        AdditiveShare(!self.0, !self.1)
     }
 }
 
@@ -261,55 +324,79 @@ where
 }
 
 /// Implement `ArrayAccess` for `AdditiveShare` over `SharedValue` that implements `ArrayAccess`
-impl<S> ArrayAccess for AdditiveShare<S>
+// You can think of S as a Boolean array type and V as Boolean.
+impl<S, V, A> ArrayAccess for AdditiveShare<S>
 where
-    S: ArrayAccess + SharedValue,
-    <S as ArrayAccess>::Output: SharedValue,
+    S: SharedValue + ArrayAccess<Output = V>,
+    V: SharedValue + Vectorizable<1, Array = A>,
+    A: SharedValueArray<V>,
 {
-    type Output = AdditiveShare<<S as ArrayAccess>::Output>;
-    type Iter<'a> = ASIterator<S::Iter<'a>>;
+    type Output = AdditiveShare<V>;
+    type Iter<'a> = ASIterator<'a, S>;
 
     fn get(&self, index: usize) -> Option<Self::Output> {
-        self.0
+        S::from_array(&self.0)
             .get(index)
-            .zip(self.1.get(index))
-            .map(|v| AdditiveShare(v.0, v.1))
+            .zip(S::from_array(&self.1).get(index))
+            .map(|v| AdditiveShare(v.0.into_array(), v.1.into_array()))
     }
 
     fn set(&mut self, index: usize, e: Self::Output) {
-        self.0.set(index, e.0);
-        self.1.set(index, e.1);
+        S::from_array_mut(&mut self.0).set(index, V::from_array(&e.0));
+        S::from_array_mut(&mut self.1).set(index, V::from_array(&e.1));
     }
 
     fn iter(&self) -> Self::Iter<'_> {
-        ASIterator(self.0.iter(), self.1.iter())
+        ASIterator {
+            range: Range {
+                start: 0,
+                end: S::from_array(&self.0).iter().len(),
+            },
+            share: self,
+        }
     }
 }
 
-impl<S> Expand for AdditiveShare<S>
+impl<S, A, T> Expand for AdditiveShare<S>
 where
-    S: Expand + SharedValue,
-    <S as Expand>::Input: SharedValue,
+    S: Expand<Input = T> + SharedValue + Vectorizable<1, Array = A>,
+    A: SharedValueArray<S>,
+    T: SharedValue,
 {
     type Input = AdditiveShare<<S as Expand>::Input>;
 
     fn expand(v: &Self::Input) -> Self {
-        AdditiveShare(S::expand(&v.0), S::expand(&v.1))
+        AdditiveShare(
+            S::expand(&T::from_array(&v.0)).into_array(),
+            S::expand(&T::from_array(&v.1)).into_array(),
+        )
     }
 }
 
-impl<T> Iterator for ASIterator<T>
+impl<'a, S, T> Iterator for ASIterator<'a, S>
 where
-    T: Iterator,
-    T::Item: SharedValue,
+    S: SharedValue + ArrayAccess<Output = T>,
+    T: SharedValue,
 {
-    type Item = AdditiveShare<T::Item>;
+    type Item = AdditiveShare<T>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        match (self.0.next(), self.1.next()) {
-            (Some(left), Some(right)) => Some(AdditiveShare(left, right)),
-            _ => None,
-        }
+        self.range.next().map(|i| {
+            AdditiveShare(
+                S::from_array(&self.share.0).get(i).unwrap().into_array(),
+                S::from_array(&self.share.1).get(i).unwrap().into_array(),
+            )
+        })
+    }
+}
+
+impl<'a, S> ExactSizeIterator for ASIterator<'a, S>
+where
+    S: SharedValue + ArrayAccess,
+    <S as ArrayAccess>::Output: SharedValue,
+{
+    fn len(&self) -> usize {
+        self.range.len()
     }
 }
 
@@ -370,8 +457,14 @@ mod tests {
         a3: &AdditiveShare<Fp31>,
         expected_value: u128,
     ) {
-        assert_eq!(a1.0 + a2.0 + a3.0, Fp31::truncate_from(expected_value));
-        assert_eq!(a1.1 + a2.1 + a3.1, Fp31::truncate_from(expected_value));
+        assert_eq!(
+            a1.left() + a2.left() + a3.left(),
+            Fp31::truncate_from(expected_value)
+        );
+        assert_eq!(
+            a1.right() + a2.right() + a3.right(),
+            Fp31::truncate_from(expected_value)
+        );
     }
 
     fn addition_test_case(a: (u8, u8, u8), b: (u8, u8, u8), expected_output: u128) {
diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs
index 8e8e4ac99..1fce74ead 100644
--- a/ipa-core/src/test_fixture/circuit.rs
+++ b/ipa-core/src/test_fixture/circuit.rs
@@ -1,4 +1,5 @@
 use futures_util::future::join_all;
+use rand::distributions::{Distribution, Standard};
 
 use super::join3v;
 use crate::{
@@ -10,18 +11,20 @@ use crate::{
         RecordId,
     },
     rand::thread_rng,
-    secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, IntoShares},
-    test_fixture::{narrow_contexts, Reconstruct, TestWorld},
+    secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares},
+    test_fixture::{narrow_contexts, ReconstructArr, TestWorld},
 };
 
 /// Creates an arithmetic circuit with the given width and depth.
 ///
 /// # Panics
 /// panics when circuits did not produce the expected value.
-pub async fn arithmetic<F>(width: u32, depth: u8)
+pub async fn arithmetic<F, const N: usize>(width: u32, depth: u16)
 where
-    F: Field + IntoShares<Replicated<F>>,
-    for<'a> Replicated<F>: SecureMul<SemiHonestContext<'a>>,
+    F: Field + FieldSimd<N>,
+    for<'a> Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
+    [F; N]: IntoShares<Replicated<F, N>>,
+    Standard: Distribution<F>,
 {
     let world = TestWorld::default();
     // Re-use contexts for the entire execution because record identifiers are contiguous.
@@ -35,31 +38,40 @@ where
 
     #[allow(clippy::disallowed_methods)] // Just for testing purposes.
     let results = join_all(multiplications).await;
-    let mut sum = 0;
+    let mut sum = [0u128; N];
     for line in results {
-        sum += line.reconstruct().as_u128();
+        for (this_sum, this_value) in sum.iter_mut().zip(line.reconstruct_arr()) {
+            *this_sum += this_value.as_u128();
+        }
     }
 
-    assert_eq!(sum, u128::from(width));
+    assert_eq!(sum, [u128::from(width); N]);
 }
 
-async fn circuit<'a, F>(
+async fn circuit<'a, F, const N: usize>(
     top_ctx: &[SemiHonestContext<'a>; 3],
     record_id: RecordId,
-    depth: u8,
-) -> [Replicated<F>; 3]
+    depth: u16,
+) -> [Replicated<F, N>; 3]
 where
-    F: Field + IntoShares<Replicated<F>>,
-    Replicated<F>: SecureMul<SemiHonestContext<'a>>,
+    F: Field + FieldSimd<N>,
+    Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
+    [F; N]: IntoShares<Replicated<F, N>>,
 {
-    let mut a = F::ONE.share_with(&mut thread_rng());
+    assert_eq!(
+        depth % u16::try_from(N).unwrap(),
+        0,
+        "depth must be a multiple of vectorization factor"
+    );
+
+    let mut a = [F::ONE; N].share_with(&mut thread_rng());
 
-    for bit in 0..depth {
-        let b = F::ONE.share_with(&mut thread_rng());
-        let bit_ctx = narrow_contexts(top_ctx, &format!("b{bit}"));
+    for stripe in 0..(depth / u16::try_from(N).unwrap()) {
+        let b = [F::ONE; N].share_with(&mut thread_rng());
+        let stripe_ctx = narrow_contexts(top_ctx, &format!("s{stripe}"));
         a = async move {
             let mut coll = Vec::new();
-            for (i, ctx) in bit_ctx.iter().enumerate() {
+            for (i, ctx) in stripe_ctx.iter().enumerate() {
                 let mul = a[i].multiply(
                     &b[i],
                     ctx.narrow("mult")
diff --git a/ipa-core/src/test_fixture/mod.rs b/ipa-core/src/test_fixture/mod.rs
index acfb8f853..e383d4db0 100644
--- a/ipa-core/src/test_fixture/mod.rs
+++ b/ipa-core/src/test_fixture/mod.rs
@@ -23,7 +23,7 @@ pub use event_gen::{Config as EventGeneratorConfig, EventGenerator};
 use futures::TryFuture;
 use rand::{distributions::Standard, prelude::Distribution, rngs::mock::StepRng};
 use rand_core::{CryptoRng, RngCore};
-pub use sharing::{get_bits, into_bits, Reconstruct};
+pub use sharing::{get_bits, into_bits, Reconstruct, ReconstructArr};
 #[cfg(feature = "in-memory-infra")]
 pub use world::{Runner, TestWorld, TestWorldConfig};
 
diff --git a/ipa-core/src/test_fixture/sharing.rs b/ipa-core/src/test_fixture/sharing.rs
index a9ac85cf3..95eba9902 100644
--- a/ipa-core/src/test_fixture/sharing.rs
+++ b/ipa-core/src/test_fixture/sharing.rs
@@ -9,7 +9,7 @@ use crate::{
             semi_honest::AdditiveShare as Replicated,
             ReplicatedSecretSharing,
         },
-        BitDecomposed, SecretSharing,
+        BitDecomposed, FieldSimd, SecretSharing, Vectorizable,
     },
 };
 
@@ -20,7 +20,7 @@ pub fn into_bits<F: PrimeField>(v: F) -> BitDecomposed<F> {
     })
 }
 
-/// Deconstructs a value into N values, one for each bi3t.
+/// Deconstructs a value into N values, one for each bit.
 /// # Panics
 /// It won't
 #[must_use]
@@ -37,6 +37,19 @@ pub trait Reconstruct<T> {
     fn reconstruct(&self) -> T;
 }
 
+/// Alternate version of `Reconstruct` for vectors.
+///
+/// There is no difference in the traits, but this avoids having to add
+/// type annotations everywhere to disambiguate whether a single-bit
+/// result should be reconstructed as `F` or `[F; 1]`.
+pub trait ReconstructArr<T> {
+    /// Reconstructs the shared vector, checking that the replicated shares are consistent.
+    ///
+    /// # Panics
+    /// Panics if the given input is not a valid replicated secret share.
+    fn reconstruct_arr(&self) -> T;
+}
+
 impl<F: Field> Reconstruct<F> for [&Replicated<F>; 3] {
     fn reconstruct(&self) -> F {
         let s0 = &self[0];
@@ -62,6 +75,27 @@ impl<F: Field> Reconstruct<F> for [Replicated<F>; 3] {
     }
 }
 
+impl<F: Field + FieldSimd<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
+    for [Replicated<F, N>; 3]
+{
+    fn reconstruct_arr(&self) -> <F as Vectorizable<N>>::Array {
+        let s0l = self[0].left_arr();
+        let s0r = self[0].right_arr();
+        let s1l = self[1].left_arr();
+        let s1r = self[1].right_arr();
+        let s2l = self[2].left_arr();
+        let s2r = self[2].right_arr();
+
+        assert_eq!(s0l.clone() + s1l + s2l, s0r.clone() + s1r + s2r);
+
+        assert_eq!(s0r, s1l);
+        assert_eq!(s1r, s2l);
+        assert_eq!(s2r, s0l);
+
+        s0l.clone() + s1l + s2l
+    }
+}
+
 impl<T, U, V, W> Reconstruct<(V, W)> for [(T, U); 3]
 where
     for<'t> [&'t T; 3]: Reconstruct<V>,