Merge pull request #959 from andyleiserson/arithmetic-benchmark

Rewritten arithmetic circuit benchmark
private-attribution · Mar 6, 2024 · 951bff3 · 951bff3
2 parents 212ee7b + d47cb77
commit 951bff3
Show file tree

Hide file tree

Showing 7 changed files with 158 additions and 65 deletions.
diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs
@@ -1,6 +1,6 @@
 use criterion::{
-    black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
-    BenchmarkId, Criterion, SamplingMode, Throughput,
+    black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize,
+    BenchmarkGroup, BenchmarkId, Criterion, SamplingMode, Throughput,
 };
 use ipa_core::{
     ff::{Field, Fp31, Fp32BitPrime, U128Conversions},
@@ -16,6 +16,7 @@ fn do_benchmark<M, F, const N: usize>(
     group: &mut BenchmarkGroup<M>,
     width: u32,
     depth: u16,
+    active_work: usize,
 ) where
     M: Measurement,
     F: Field + FieldSimd<N> + U128Conversions,
@@ -25,11 +26,24 @@ fn do_benchmark<M, F, const N: usize>(
 {
     group.throughput(Throughput::Elements((width * depth as u32) as u64));
     group.bench_with_input(
-        BenchmarkId::new("circuit", format!("{width}:{depth}:{}x{}", F::NAME, N)),
+        BenchmarkId::new(
+            "circuit",
+            format!("{width}:{depth}:{active_work}:{}x{}", F::NAME, N),
+        ),
         &(width, depth),
         |b, &(width, depth)| {
-            b.to_async(rt)
-                .iter(|| circuit::arithmetic::<F, N>(black_box(width), black_box(depth)));
+            b.to_async(rt).iter_batched(
+                || circuit::arithmetic_setup(width, depth),
+                |input| {
+                    circuit::arithmetic::<F, N>(
+                        black_box(width),
+                        black_box(depth),
+                        active_work,
+                        input,
+                    )
+                },
+                BatchSize::PerIteration,
+            );
         },
     );
 }
@@ -46,14 +60,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     group.sample_size(10);
     group.sampling_mode(SamplingMode::Flat);
 
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 512_000, 1);
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 51_200, 10);
-    do_benchmark::<_, Fp31, 1>(&rt, &mut group, 8_000, 64);
+    // Note that the width parameter (3rd-to-last argument to do_benchmark) must
+    // be a multiple of the vectorization width.
 
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 25_600, 10);
-    do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 2_560, 100);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_000, 64);
-    do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 250, 1_024);
+    #[cfg(not(coverage))]
+    {
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024);
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024);
+
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024);
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32);
+    }
+
+    #[cfg(coverage)]
+    {
+        do_benchmark::<_, Fp31, 1>(&rt, &mut group, 256, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 256, 64, 32);
+        do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 256, 64, 32);
+    }
 }
 
 criterion_group!(benches, criterion_benchmark);

diff --git a/ipa-core/benches/iai/arithmetic_circuit.rs b/ipa-core/benches/iai/arithmetic_circuit.rs
@@ -12,8 +12,16 @@ pub fn iai_benchmark() {
     const CIRCUIT_WIDTH: u32 = 500_000;
     const CIRCUIT_DEPTH: u16 = 1;
 
+    tracing::warn!("test data generation may skew results of this benchmark");
     rt.block_on(async {
-        circuit::arithmetic::<Fp31, 1>(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await;
+        let input = circuit::arithmetic_setup(CIRCUIT_WIDTH, CIRCUIT_DEPTH);
+        circuit::arithmetic::<Fp31, 1>(
+            black_box(CIRCUIT_WIDTH),
+            black_box(CIRCUIT_DEPTH),
+            1024,
+            input,
+        )
+        .await;
     })
 }
 

diff --git a/ipa-core/benches/oneshot/arithmetic_circuit.rs b/ipa-core/benches/oneshot/arithmetic_circuit.rs
@@ -33,8 +33,9 @@ pub async fn main() {
         println!("benchmark parameters: Field size: {field_size} bits, circuit width: {width}, depth: {depth}");
     }
 
+    let input = circuit::arithmetic_setup(args.width, args.depth);
     let start = Instant::now();
-    circuit::arithmetic::<Fp31, 1>(args.width, args.depth).await;
+    circuit::arithmetic::<Fp31, 1>(args.width, args.depth, 1024, input).await;
     let duration = start.elapsed().as_secs_f32();
 
     println!("benchmark complete after {duration}s");

diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs
@@ -1,89 +1,141 @@
-use futures_util::future::join_all;
+use std::{array, num::NonZeroUsize};
+
+use futures::{future::join3, stream, StreamExt};
 use rand::distributions::{Distribution, Standard};
 
-use super::join3v;
 use crate::{
     ff::{Field, U128Conversions},
-    helpers::TotalRecords,
+    helpers::{GatewayConfig, TotalRecords},
     protocol::{
         basics::SecureMul,
         context::{Context, SemiHonestContext},
         RecordId,
     },
     rand::thread_rng,
     secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares},
-    test_fixture::{narrow_contexts, ReconstructArr, TestWorld},
+    seq_join::seq_join,
+    test_fixture::{ReconstructArr, TestWorld, TestWorldConfig},
 };
 
-/// Creates an arithmetic circuit with the given width and depth.
+pub struct Inputs<F: Field + FieldSimd<N>, const N: usize> {
+    a: Replicated<F, N>,
+    b: Vec<Replicated<F, N>>,
+}
+
+impl<F: Field + FieldSimd<N>, const N: usize> Inputs<F, N> {
+    fn new(a: Replicated<F, N>, b: Vec<Replicated<F, N>>) -> Self {
+        Self { a, b }
+    }
+}
+
+/// Generates test data for the arithmetic ciruit benchmark.
 ///
 /// # Panics
-/// panics when circuits did not produce the expected value.
-pub async fn arithmetic<F, const N: usize>(width: u32, depth: u16)
+/// On functional errors, since this is a benchmark.
+#[must_use]
+pub fn arithmetic_setup<F, const N: usize>(width: u32, depth: u16) -> [Vec<Inputs<F, N>>; 3]
 where
+    F: Field + FieldSimd<N>,
+    Standard: Distribution<F>,
+{
+    let mut rng = thread_rng();
+    let mut data = array::from_fn(|_| Vec::with_capacity(width as usize / N));
+    for _ in 0..(width / u32::try_from(N).unwrap()) {
+        let [a0, a1, a2] = [F::ONE; N].share_with(&mut rng);
+        let mut b0 = Vec::with_capacity(depth as usize);
+        let mut b1 = Vec::with_capacity(depth as usize);
+        let mut b2 = Vec::with_capacity(depth as usize);
+        for _ in 0..(depth as usize) {
+            let [s0, s1, s2] = [F::ONE; N].share_with(&mut rng);
+            b0.push(s0);
+            b1.push(s1);
+            b2.push(s2);
+        }
+        data[0].push(Inputs::new(a0, b0));
+        data[1].push(Inputs::new(a1, b1));
+        data[2].push(Inputs::new(a2, b2));
+    }
+    data
+}
+
+/// Creates an arithmetic circuit with the given width and depth.
+///
+/// # Panics
+/// On functional errors, since this is a benchmark.
+pub async fn arithmetic<F, const N: usize>(
+    width: u32,
+    depth: u16,
+    active_work: usize,
+    input_data: [Vec<Inputs<F, N>>; 3],
+) where
     F: Field + FieldSimd<N> + U128Conversions,
     for<'a> Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
     [F; N]: IntoShares<Replicated<F, N>>,
     Standard: Distribution<F>,
 {
-    let world = TestWorld::default();
+    let config = TestWorldConfig {
+        gateway_config: GatewayConfig::new(active_work),
+        ..Default::default()
+    };
+    let world = TestWorld::new_with(config);
+
     // Re-use contexts for the entire execution because record identifiers are contiguous.
     let contexts = world.contexts();
 
-    let mut multiplications = Vec::new();
-    for record in 0..width {
-        let circuit_result = circuit(&contexts, RecordId::from(record), depth);
-        multiplications.push(circuit_result);
-    }
+    let [inp0, inp1, inp2] = input_data;
+
+    let Ok([fut0, fut1, fut2]): Result<[_; 3], _> = contexts
+        .into_iter()
+        .zip([inp0, inp1, inp2])
+        .map(|(ctx, col_data)| {
+            // Setting TotalRecords::Indeterminate causes OrderingSender to make data available to
+            // the channel immediately, instead of doing so only after active_work records have
+            // accumulated. This gives the best performance for vectorized operation.
+            let ctx = ctx.set_total_records(TotalRecords::Indeterminate);
+            seq_join(
+                NonZeroUsize::new(active_work).unwrap(),
+                stream::iter((0..(width / u32::try_from(N).unwrap())).zip(col_data)).map(
+                    move |(record, Inputs { a, b })| {
+                        circuit(ctx.clone(), RecordId::from(record), depth, a, b)
+                    },
+                ),
+            )
+            .collect::<Vec<_>>()
+        })
+        .collect::<Vec<_>>()
+        .try_into()
+    else {
+        unreachable!("infallible try_into array");
+    };
 
-    #[allow(clippy::disallowed_methods)] // Just for testing purposes.
-    let results = join_all(multiplications).await;
-    let mut sum = [0u128; N];
-    for line in results {
-        for (this_sum, this_value) in sum.iter_mut().zip(line.reconstruct_arr()) {
-            *this_sum += this_value.as_u128();
+    let (res0, res1, res2) = join3(fut0, fut1, fut2).await;
+
+    let mut sum = 0;
+    for line in res0.into_iter().zip(res1).zip(res2) {
+        let ((s0, s1), s2) = line;
+        for col_sum in [s0, s1, s2].reconstruct_arr() {
+            sum += col_sum.as_u128();
         }
     }
 
-    assert_eq!(sum, [u128::from(width); N]);
+    assert_eq!(sum, u128::from(width));
 }
 
 async fn circuit<'a, F, const N: usize>(
-    top_ctx: &[SemiHonestContext<'a>; 3],
+    ctx: SemiHonestContext<'a>,
     record_id: RecordId,
     depth: u16,
-) -> [Replicated<F, N>; 3]
+    mut a: Replicated<F, N>,
+    b: Vec<Replicated<F, N>>,
+) -> Replicated<F, N>
 where
     F: Field + FieldSimd<N>,
     Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
-    [F; N]: IntoShares<Replicated<F, N>>,
 {
-    assert_eq!(
-        depth % u16::try_from(N).unwrap(),
-        0,
-        "depth must be a multiple of vectorization factor"
-    );
-
-    let mut a = [F::ONE; N].share_with(&mut thread_rng());
-
-    for stripe in 0..(depth / u16::try_from(N).unwrap()) {
-        let b = [F::ONE; N].share_with(&mut thread_rng());
-        let stripe_ctx = narrow_contexts(top_ctx, &format!("s{stripe}"));
-        a = async move {
-            let mut coll = Vec::new();
-            for (i, ctx) in stripe_ctx.iter().enumerate() {
-                let mul = a[i].multiply(
-                    &b[i],
-                    ctx.narrow("mult")
-                        .set_total_records(TotalRecords::Indeterminate),
-                    record_id,
-                );
-                coll.push(mul);
-            }
-
-            join3v(coll).await
-        }
-        .await;
+    assert_eq!(b.len(), usize::from(depth));
+    for (stripe_ix, stripe) in b.iter().enumerate() {
+        let stripe_ctx = ctx.narrow(&format!("s{stripe_ix}"));
+        a = a.multiply(stripe, stripe_ctx, record_id).await.unwrap();
     }
 
     a

diff --git a/ipa-core/src/test_fixture/sharing.rs b/ipa-core/src/test_fixture/sharing.rs
@@ -8,7 +8,7 @@ use crate::{
             semi_honest::AdditiveShare as Replicated,
             ReplicatedSecretSharing,
         },
-        BitDecomposed, FieldSimd, SharedValue, Vectorizable,
+        BitDecomposed, SharedValue, Vectorizable,
     },
 };
 
@@ -76,7 +76,7 @@ impl<V: SharedValue> Reconstruct<V> for [Replicated<V>; 3] {
     }
 }
 
-impl<F: Field + FieldSimd<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
+impl<F: Field + Vectorizable<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
     for [Replicated<F, N>; 3]
 {
     fn reconstruct_arr(&self) -> <F as Vectorizable<N>>::Array {

diff --git a/pre-commit b/pre-commit
@@ -90,6 +90,8 @@ check "Clippy concurrency checks" \
 check "Clippy web checks" \
     cargo clippy --tests --no-default-features --features "cli web-app real-world-infra test-fixture descriptive-gate" -- -D warnings
 
+# The tests here need to be kept in sync with scripts/coverage-ci.
+
 check "Tests" \
     cargo test
 

diff --git a/scripts/coverage-ci b/scripts/coverage-ci
@@ -14,4 +14,8 @@ for gate in "compact" "descriptive"; do
   cargo test --no-default-features --features "cli web-app real-world-infra test-fixture $gate-gate"
 done
 
+cargo test --bench oneshot_ipa --no-default-features --features "enable-benches descriptive-gate" -- -n 62 -c 16
+
+cargo test --bench criterion_arithmetic --no-default-features --features "enable-benches descriptive-gate"
+
 cargo llvm-cov report "$@"