From 3c10b1b606551279c966f1226de2d3989ce478c8 Mon Sep 17 00:00:00 2001
From: Andy Leiserson <aleiserson@mozilla.com>
Date: Wed, 21 Feb 2024 17:11:52 -0800
Subject: [PATCH 1/4] Rewritten arithmetic circuit benchmark

The previous version of the benchmark did not use seq_join. Using
seq_join is necessary for parallel execution, and also provides a fairer
baseline for unvectorized serial operation.

This change also generates test data outside the timed region (except
under the iai framework, where that is not supported).
---
 ipa-core/benches/ct/arithmetic_circuit.rs     |  40 +++--
 ipa-core/benches/iai/arithmetic_circuit.rs    |  10 +-
 .../benches/oneshot/arithmetic_circuit.rs     |   3 +-
 ipa-core/src/test_fixture/circuit.rs          | 150 ++++++++++++------
 4 files changed, 140 insertions(+), 63 deletions(-)
diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs
index ee7a3ae75..73d45befc 100644
--- a/ipa-core/benches/ct/arithmetic_circuit.rs
+++ b/ipa-core/benches/ct/arithmetic_circuit.rs
@@ -1,6 +1,6 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
-    BenchmarkId, Criterion, SamplingMode, Throughput,
+    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize,
+    BenchmarkGroup, BenchmarkId, Criterion, SamplingMode, Throughput,
 };
 use ipa_core::{
     ff::{Field, Fp31, Fp32BitPrime, U128Conversions},
@@ -16,6 +16,7 @@ fn do_benchmark<M, F, const N: usize>(
     group: &mut BenchmarkGroup<M>,
     width: u32,
     depth: u16,
+    active_work: usize,
 ) where
     M: Measurement,
     F: Field + FieldSimd<N> + U128Conversions,
@@ -25,11 +26,24 @@ fn do_benchmark<M, F, const N: usize>(
 {
     group.throughput(Throughput::Elements((width * depth as u32) as u64));
     group.bench_with_input(
-        BenchmarkId::new("circuit", format!("{width}:{depth}:{}x{}", F::NAME, N)),
+        BenchmarkId::new(
+            "circuit",
+            format!("{width}:{depth}:{active_work}:{}x{}", F::NAME, N),
+        ),
         &(width, depth),
         |b, &(width, depth)| {
-            b.to_async(rt)
-                .iter(|| circuit::arithmetic::<F, N>(black_box(width), black_box(depth)));
+            b.to_async(rt).iter_batched(
+                || circuit::arithmetic_setup(width, depth),
+                |input| {
+                    circuit::arithmetic::<F, N>(
+                        black_box(width),
+                        black_box(depth),
+                        active_work,
+                        input,
+                    )
+                },
+                BatchSize::PerIteration,
+            );
         },
     );
 }
@@ -46,14 +60,16 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     group.sample_size(10);
     group.sampling_mode(SamplingMode::Flat);
 
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 512_000, 1);
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 51_200, 10);
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 8_000, 64);
+    // Note that the width parameter (3rd-to-last argument to do_benchmark) must
+    // be a multiple of the vectorization width.
 
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 25_600, 10);
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 2_560, 100);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_000, 64);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 250, 1_024);
+    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024);
+    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024);
+
+    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024);
+    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024);
+    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32);
+    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32);
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/ipa-core/benches/iai/arithmetic_circuit.rs b/ipa-core/benches/iai/arithmetic_circuit.rs
index ef43e70a0..912c4648d 100644
--- a/ipa-core/benches/iai/arithmetic_circuit.rs
+++ b/ipa-core/benches/iai/arithmetic_circuit.rs
@@ -12,8 +12,16 @@ pub fn iai_benchmark() {
     const CIRCUIT_WIDTH: u32 = 500_000;
     const CIRCUIT_DEPTH: u16 = 1;
 
+    tracing::warn!("test data generation may skew results of this benchmark");
     rt.block_on(async {
-        circuit::arithmetic::<Fp31, 1>(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await;
+        let input = circuit::arithmetic_setup(CIRCUIT_WIDTH, CIRCUIT_DEPTH);
+        circuit::arithmetic::<Fp31, 1>(
+            black_box(CIRCUIT_WIDTH),
+            black_box(CIRCUIT_DEPTH),
+            1024,
+            input,
+        )
+        .await;
     })
 }
 
diff --git a/ipa-core/benches/oneshot/arithmetic_circuit.rs b/ipa-core/benches/oneshot/arithmetic_circuit.rs
index c78a409ea..459bd2e58 100644
--- a/ipa-core/benches/oneshot/arithmetic_circuit.rs
+++ b/ipa-core/benches/oneshot/arithmetic_circuit.rs
@@ -33,8 +33,9 @@ pub async fn main() {
         println!("benchmark parameters: Field size: {field_size} bits, circuit width: {width}, depth: {depth}");
     }
 
+    let input = circuit::arithmetic_setup(args.width, args.depth);
     let start = Instant::now();
-    circuit::arithmetic::<Fp31, 1>(args.width, args.depth).await;
+    circuit::arithmetic::<Fp31, 1>(args.width, args.depth, 1024, input).await;
     let duration = start.elapsed().as_secs_f32();
 
     println!("benchmark complete after {duration}s");
diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs
index 2f1f67cb0..8bb3655b5 100644
--- a/ipa-core/src/test_fixture/circuit.rs
+++ b/ipa-core/src/test_fixture/circuit.rs
@@ -1,10 +1,11 @@
-use futures_util::future::join_all;
+use std::{array, num::NonZeroUsize};
+
+use futures::{future::join3, stream, StreamExt};
 use rand::distributions::{Distribution, Standard};
 
-use super::join3v;
 use crate::{
     ff::{Field, U128Conversions},
-    helpers::TotalRecords,
+    helpers::{GatewayConfig, TotalRecords},
     protocol::{
         basics::SecureMul,
         context::{Context, SemiHonestContext},
@@ -12,78 +13,129 @@ use crate::{
     },
     rand::thread_rng,
     secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares},
-    test_fixture::{narrow_contexts, ReconstructArr, TestWorld},
+    seq_join::seq_join,
+    test_fixture::{ReconstructArr, TestWorld, TestWorldConfig},
 };
 
-/// Creates an arithmetic circuit with the given width and depth.
+pub struct Inputs<F: Field + FieldSimd<N>, const N: usize> {
+    a: Replicated<F, N>,
+    b: Vec<Replicated<F, N>>,
+}
+
+impl<F: Field + FieldSimd<N>, const N: usize> Inputs<F, N> {
+    fn new(a: Replicated<F, N>, b: Vec<Replicated<F, N>>) -> Self {
+        Self { a, b }
+    }
+}
+
+/// Generates test data for the arithmetic ciruit benchmark.
 ///
 /// # Panics
-/// panics when circuits did not produce the expected value.
-pub async fn arithmetic<F, const N: usize>(width: u32, depth: u16)
+/// On functional errors, since this is a benchmark.
+#[must_use]
+pub fn arithmetic_setup<F, const N: usize>(width: u32, depth: u16) -> [Vec<Inputs<F, N>>; 3]
 where
+    F: Field + FieldSimd<N>,
+    Standard: Distribution<F>,
+{
+    let mut rng = thread_rng();
+    let mut data = array::from_fn(|_| Vec::with_capacity(width as usize / N));
+    for _ in 0..(width / u32::try_from(N).unwrap()) {
+        let [a0, a1, a2] = [F::ONE; N].share_with(&mut rng);
+        let mut b0 = Vec::with_capacity(depth as usize);
+        let mut b1 = Vec::with_capacity(depth as usize);
+        let mut b2 = Vec::with_capacity(depth as usize);
+        for _ in 0..(depth as usize) {
+            let [s0, s1, s2] = [F::ONE; N].share_with(&mut rng);
+            b0.push(s0);
+            b1.push(s1);
+            b2.push(s2);
+        }
+        data[0].push(Inputs::new(a0, b0));
+        data[1].push(Inputs::new(a1, b1));
+        data[2].push(Inputs::new(a2, b2));
+    }
+    data
+}
+
+/// Creates an arithmetic circuit with the given width and depth.
+///
+/// # Panics
+/// On functional errors, since this is a benchmark.
+pub async fn arithmetic<F, const N: usize>(
+    width: u32,
+    depth: u16,
+    active_work: usize,
+    input_data: [Vec<Inputs<F, N>>; 3],
+) where
     F: Field + FieldSimd<N> + U128Conversions,
     for<'a> Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
     [F; N]: IntoShares<Replicated<F, N>>,
     Standard: Distribution<F>,
 {
-    let world = TestWorld::default();
+    let config = TestWorldConfig {
+        gateway_config: GatewayConfig::new(active_work),
+        ..Default::default()
+    };
+    let world = TestWorld::new_with(config);
+
     // Re-use contexts for the entire execution because record identifiers are contiguous.
     let contexts = world.contexts();
 
-    let mut multiplications = Vec::new();
-    for record in 0..width {
-        let circuit_result = circuit(&contexts, RecordId::from(record), depth);
-        multiplications.push(circuit_result);
-    }
+    let [inp0, inp1, inp2] = input_data;
+
+    let Ok([fut0, fut1, fut2]): Result<[_; 3], _> = contexts
+        .into_iter()
+        .zip([inp0, inp1, inp2])
+        .map(|(ctx, col_data)| {
+            // Setting TotalRecords::Indeterminate causes OrderingSender to make data available to
+            // the channel immediately, instead of doing so only after active_work records have
+            // accumulated. This gives the best performance for vectorized operation.
+            let ctx = ctx.set_total_records(TotalRecords::Indeterminate);
+            seq_join(
+                NonZeroUsize::new(active_work).unwrap(),
+                stream::iter((0..(width / u32::try_from(N).unwrap())).zip(col_data)).map(
+                    move |(record, Inputs { a, b })| {
+                        circuit(ctx.clone(), RecordId::from(record), depth, a, b)
+                    },
+                ),
+            )
+            .collect::<Vec<_>>()
+        })
+        .collect::<Vec<_>>()
+        .try_into()
+    else {
+        panic!("infallible try_into array");
+    };
 
-    #[allow(clippy::disallowed_methods)] // Just for testing purposes.
-    let results = join_all(multiplications).await;
-    let mut sum = [0u128; N];
-    for line in results {
-        for (this_sum, this_value) in sum.iter_mut().zip(line.reconstruct_arr()) {
-            *this_sum += this_value.as_u128();
+    let (res0, res1, res2) = join3(fut0, fut1, fut2).await;
+
+    let mut sum = 0;
+    for line in res0.into_iter().zip(res1).zip(res2) {
+        let ((s0, s1), s2) = line;
+        for col_sum in [s0, s1, s2].reconstruct_arr() {
+            sum += col_sum.as_u128();
         }
     }
 
-    assert_eq!(sum, [u128::from(width); N]);
+    assert_eq!(sum, u128::from(width));
 }
 
 async fn circuit<'a, F, const N: usize>(
-    top_ctx: &[SemiHonestContext<'a>; 3],
+    ctx: SemiHonestContext<'a>,
     record_id: RecordId,
     depth: u16,
-) -> [Replicated<F, N>; 3]
+    mut a: Replicated<F, N>,
+    b: Vec<Replicated<F, N>>,
+) -> Replicated<F, N>
 where
     F: Field + FieldSimd<N>,
     Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
-    [F; N]: IntoShares<Replicated<F, N>>,
 {
-    assert_eq!(
-        depth % u16::try_from(N).unwrap(),
-        0,
-        "depth must be a multiple of vectorization factor"
-    );
-
-    let mut a = [F::ONE; N].share_with(&mut thread_rng());
-
-    for stripe in 0..(depth / u16::try_from(N).unwrap()) {
-        let b = [F::ONE; N].share_with(&mut thread_rng());
-        let stripe_ctx = narrow_contexts(top_ctx, &format!("s{stripe}"));
-        a = async move {
-            let mut coll = Vec::new();
-            for (i, ctx) in stripe_ctx.iter().enumerate() {
-                let mul = a[i].multiply(
-                    &b[i],
-                    ctx.narrow("mult")
-                        .set_total_records(TotalRecords::Indeterminate),
-                    record_id,
-                );
-                coll.push(mul);
-            }
-
-            join3v(coll).await
-        }
-        .await;
+    assert_eq!(b.len(), usize::from(depth));
+    for (stripe_ix, stripe) in b.iter().enumerate() {
+        let stripe_ctx = ctx.narrow(&format!("s{stripe_ix}"));
+        a = a.multiply(stripe, stripe_ctx, record_id).await.unwrap();
     }
 
     a

From 2276c1b2cfbcfba7f4ee3f85b6a11e223904d42d Mon Sep 17 00:00:00 2001
From: Andy Leiserson <aleiserson@mozilla.com>
Date: Thu, 22 Feb 2024 12:14:42 -0800
Subject: [PATCH 2/4] Add benchmarks to coverage

---
 ipa-core/benches/ct/arithmetic_circuit.rs | 22 ++++++++++++++++------
 pre-commit                                |  2 ++
 scripts/coverage-ci                       |  4 ++++
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs
index 73d45befc..d3b19c7f3 100644
--- a/ipa-core/benches/ct/arithmetic_circuit.rs
+++ b/ipa-core/benches/ct/arithmetic_circuit.rs
@@ -63,13 +63,23 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     // Note that the width parameter (3rd-to-last argument to do_benchmark) must
     // be a multiple of the vectorization width.
 
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024);
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024);
+    #[cfg(not(coverage))]
+    {
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024);
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024);
 
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024);
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32);
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024);
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32);
+    }
+
+    #[cfg(coverage)]
+    {
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 256, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 256, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 256, 64, 32);
+    }
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/pre-commit b/pre-commit
index f081393e2..05e65a720 100755
--- a/pre-commit
+++ b/pre-commit
@@ -90,6 +90,8 @@ check "Clippy concurrency checks" \
 check "Clippy web checks" \
     cargo clippy --tests --no-default-features --features "cli web-app real-world-infra test-fixture descriptive-gate" -- -D warnings
 
+# The tests here need to be kept in sync with scripts/coverage-ci.
+
 check "Tests" \
     cargo test
 
diff --git a/scripts/coverage-ci b/scripts/coverage-ci
index eefcbb21d..b7b026534 100755
--- a/scripts/coverage-ci
+++ b/scripts/coverage-ci
@@ -14,4 +14,8 @@ for gate in "compact" "descriptive"; do
   cargo test --no-default-features --features "cli web-app real-world-infra test-fixture $gate-gate"
 done
 
+cargo test --bench oneshot_ipa --no-default-features --features "enable-benches descriptive-gate" -- -n 62 -c 16
+
+cargo test --bench criterion_arithmetic --no-default-features --features "enable-benches descriptive-gate"
+
 cargo llvm-cov report "$@"

From 79642a2f027d4a0ccfe0de9ce1eba6e3aa7ff7ac Mon Sep 17 00:00:00 2001
From: Andy Leiserson <aleiserson@mozilla.com>
Date: Thu, 22 Feb 2024 14:47:33 -0800
Subject: [PATCH 3/4] Fix a trait solver issue that only shows up in coverage
 builds

---
 ipa-core/src/test_fixture/sharing.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipa-core/src/test_fixture/sharing.rs b/ipa-core/src/test_fixture/sharing.rs
index d120090d4..f1559e123 100644
--- a/ipa-core/src/test_fixture/sharing.rs
+++ b/ipa-core/src/test_fixture/sharing.rs
@@ -8,7 +8,7 @@ use crate::{
             semi_honest::AdditiveShare as Replicated,
             ReplicatedSecretSharing,
         },
-        BitDecomposed, FieldSimd, SharedValue, Vectorizable,
+        BitDecomposed, SharedValue, Vectorizable,
     },
 };
 
@@ -76,7 +76,7 @@ impl<V: SharedValue> Reconstruct<V> for [Replicated<V>; 3] {
     }
 }
 
-impl<F: Field + FieldSimd<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
+impl<F: Field + Vectorizable<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
     for [Replicated<F, N>; 3]
 {
     fn reconstruct_arr(&self) -> <F as Vectorizable<N>>::Array {

From d47cb77e1646782fd018ecfa867590ed5c5d3b02 Mon Sep 17 00:00:00 2001
From: Andy Leiserson <aleiserson@mozilla.com>
Date: Fri, 23 Feb 2024 10:38:29 -0800
Subject: [PATCH 4/4] Use unreachable! instead of panic!

I was hoping that this would exclude it from coverage, but it didn't.
---
 ipa-core/src/test_fixture/circuit.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs
index 8bb3655b5..278d57195 100644
--- a/ipa-core/src/test_fixture/circuit.rs
+++ b/ipa-core/src/test_fixture/circuit.rs
@@ -105,7 +105,7 @@ pub async fn arithmetic<F, const N: usize>(
         .collect::<Vec<_>>()
         .try_into()
     else {
-        panic!("infallible try_into array");
+        unreachable!("infallible try_into array");
     };
 
     let (res0, res1, res2) = join3(fut0, fut1, fut2).await;