From 3c10b1b606551279c966f1226de2d3989ce478c8 Mon Sep 17 00:00:00 2001 From: Andy Leiserson Date: Wed, 21 Feb 2024 17:11:52 -0800 Subject: [PATCH 1/4] Rewritten arithmetic circuit benchmark The previous version of the benchmark did not use seq_join. Using seq_join is necessary for parallel execution, and also provides a fairer baseline for unvectorized serial operation. This change also generates test data outside the timed region (except under the iai framework, where that is not supported). --- ipa-core/benches/ct/arithmetic_circuit.rs | 40 +++-- ipa-core/benches/iai/arithmetic_circuit.rs | 10 +- .../benches/oneshot/arithmetic_circuit.rs | 3 +- ipa-core/src/test_fixture/circuit.rs | 150 ++++++++++++------ 4 files changed, 140 insertions(+), 63 deletions(-) diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs index ee7a3ae75..73d45befc 100644 --- a/ipa-core/benches/ct/arithmetic_circuit.rs +++ b/ipa-core/benches/ct/arithmetic_circuit.rs @@ -1,6 +1,6 @@ use criterion::{ - black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup, - BenchmarkId, Criterion, SamplingMode, Throughput, + black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, + BenchmarkGroup, BenchmarkId, Criterion, SamplingMode, Throughput, }; use ipa_core::{ ff::{Field, Fp31, Fp32BitPrime, U128Conversions}, @@ -16,6 +16,7 @@ fn do_benchmark( group: &mut BenchmarkGroup, width: u32, depth: u16, + active_work: usize, ) where M: Measurement, F: Field + FieldSimd + U128Conversions, @@ -25,11 +26,24 @@ fn do_benchmark( { group.throughput(Throughput::Elements((width * depth as u32) as u64)); group.bench_with_input( - BenchmarkId::new("circuit", format!("{width}:{depth}:{}x{}", F::NAME, N)), + BenchmarkId::new( + "circuit", + format!("{width}:{depth}:{active_work}:{}x{}", F::NAME, N), + ), &(width, depth), |b, &(width, depth)| { - b.to_async(rt) - .iter(|| circuit::arithmetic::(black_box(width), black_box(depth))); + b.to_async(rt).iter_batched( + || circuit::arithmetic_setup(width, depth), + |input| { + circuit::arithmetic::( + black_box(width), + black_box(depth), + active_work, + input, + ) + }, + BatchSize::PerIteration, + ); }, ); } @@ -46,14 +60,16 @@ pub fn criterion_benchmark(c: &mut Criterion) { group.sample_size(10); group.sampling_mode(SamplingMode::Flat); - do_benchmark::<_, Fp31, 1>(&rt, &mut group, 512_000, 1); - do_benchmark::<_, Fp31, 1>(&rt, &mut group, 51_200, 10); - do_benchmark::<_, Fp31, 1>(&rt, &mut group, 8_000, 64); + // Note that the width parameter (3rd-to-last argument to do_benchmark) must + // be a multiple of the vectorization width. - do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 25_600, 10); - do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 2_560, 100); - do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_000, 64); - do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 250, 1_024); + do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024); + do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024); + + do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024); + do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024); + do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32); + do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32); } criterion_group!(benches, criterion_benchmark); diff --git a/ipa-core/benches/iai/arithmetic_circuit.rs b/ipa-core/benches/iai/arithmetic_circuit.rs index ef43e70a0..912c4648d 100644 --- a/ipa-core/benches/iai/arithmetic_circuit.rs +++ b/ipa-core/benches/iai/arithmetic_circuit.rs @@ -12,8 +12,16 @@ pub fn iai_benchmark() { const CIRCUIT_WIDTH: u32 = 500_000; const CIRCUIT_DEPTH: u16 = 1; + tracing::warn!("test data generation may skew results of this benchmark"); rt.block_on(async { - circuit::arithmetic::(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await; + let input = circuit::arithmetic_setup(CIRCUIT_WIDTH, CIRCUIT_DEPTH); + circuit::arithmetic::( + black_box(CIRCUIT_WIDTH), + black_box(CIRCUIT_DEPTH), + 1024, + input, + ) + .await; }) } diff --git a/ipa-core/benches/oneshot/arithmetic_circuit.rs b/ipa-core/benches/oneshot/arithmetic_circuit.rs index c78a409ea..459bd2e58 100644 --- a/ipa-core/benches/oneshot/arithmetic_circuit.rs +++ b/ipa-core/benches/oneshot/arithmetic_circuit.rs @@ -33,8 +33,9 @@ pub async fn main() { println!("benchmark parameters: Field size: {field_size} bits, circuit width: {width}, depth: {depth}"); } + let input = circuit::arithmetic_setup(args.width, args.depth); let start = Instant::now(); - circuit::arithmetic::(args.width, args.depth).await; + circuit::arithmetic::(args.width, args.depth, 1024, input).await; let duration = start.elapsed().as_secs_f32(); println!("benchmark complete after {duration}s"); diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs index 2f1f67cb0..8bb3655b5 100644 --- a/ipa-core/src/test_fixture/circuit.rs +++ b/ipa-core/src/test_fixture/circuit.rs @@ -1,10 +1,11 @@ -use futures_util::future::join_all; +use std::{array, num::NonZeroUsize}; + +use futures::{future::join3, stream, StreamExt}; use rand::distributions::{Distribution, Standard}; -use super::join3v; use crate::{ ff::{Field, U128Conversions}, - helpers::TotalRecords, + helpers::{GatewayConfig, TotalRecords}, protocol::{ basics::SecureMul, context::{Context, SemiHonestContext}, @@ -12,78 +13,129 @@ use crate::{ }, rand::thread_rng, secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares}, - test_fixture::{narrow_contexts, ReconstructArr, TestWorld}, + seq_join::seq_join, + test_fixture::{ReconstructArr, TestWorld, TestWorldConfig}, }; -/// Creates an arithmetic circuit with the given width and depth. +pub struct Inputs, const N: usize> { + a: Replicated, + b: Vec>, +} + +impl, const N: usize> Inputs { + fn new(a: Replicated, b: Vec>) -> Self { + Self { a, b } + } +} + +/// Generates test data for the arithmetic ciruit benchmark. /// /// # Panics -/// panics when circuits did not produce the expected value. -pub async fn arithmetic(width: u32, depth: u16) +/// On functional errors, since this is a benchmark. +#[must_use] +pub fn arithmetic_setup(width: u32, depth: u16) -> [Vec>; 3] where + F: Field + FieldSimd, + Standard: Distribution, +{ + let mut rng = thread_rng(); + let mut data = array::from_fn(|_| Vec::with_capacity(width as usize / N)); + for _ in 0..(width / u32::try_from(N).unwrap()) { + let [a0, a1, a2] = [F::ONE; N].share_with(&mut rng); + let mut b0 = Vec::with_capacity(depth as usize); + let mut b1 = Vec::with_capacity(depth as usize); + let mut b2 = Vec::with_capacity(depth as usize); + for _ in 0..(depth as usize) { + let [s0, s1, s2] = [F::ONE; N].share_with(&mut rng); + b0.push(s0); + b1.push(s1); + b2.push(s2); + } + data[0].push(Inputs::new(a0, b0)); + data[1].push(Inputs::new(a1, b1)); + data[2].push(Inputs::new(a2, b2)); + } + data +} + +/// Creates an arithmetic circuit with the given width and depth. +/// +/// # Panics +/// On functional errors, since this is a benchmark. +pub async fn arithmetic( + width: u32, + depth: u16, + active_work: usize, + input_data: [Vec>; 3], +) where F: Field + FieldSimd + U128Conversions, for<'a> Replicated: SecureMul>, [F; N]: IntoShares>, Standard: Distribution, { - let world = TestWorld::default(); + let config = TestWorldConfig { + gateway_config: GatewayConfig::new(active_work), + ..Default::default() + }; + let world = TestWorld::new_with(config); + // Re-use contexts for the entire execution because record identifiers are contiguous. let contexts = world.contexts(); - let mut multiplications = Vec::new(); - for record in 0..width { - let circuit_result = circuit(&contexts, RecordId::from(record), depth); - multiplications.push(circuit_result); - } + let [inp0, inp1, inp2] = input_data; + + let Ok([fut0, fut1, fut2]): Result<[_; 3], _> = contexts + .into_iter() + .zip([inp0, inp1, inp2]) + .map(|(ctx, col_data)| { + // Setting TotalRecords::Indeterminate causes OrderingSender to make data available to + // the channel immediately, instead of doing so only after active_work records have + // accumulated. This gives the best performance for vectorized operation. + let ctx = ctx.set_total_records(TotalRecords::Indeterminate); + seq_join( + NonZeroUsize::new(active_work).unwrap(), + stream::iter((0..(width / u32::try_from(N).unwrap())).zip(col_data)).map( + move |(record, Inputs { a, b })| { + circuit(ctx.clone(), RecordId::from(record), depth, a, b) + }, + ), + ) + .collect::>() + }) + .collect::>() + .try_into() + else { + panic!("infallible try_into array"); + }; - #[allow(clippy::disallowed_methods)] // Just for testing purposes. - let results = join_all(multiplications).await; - let mut sum = [0u128; N]; - for line in results { - for (this_sum, this_value) in sum.iter_mut().zip(line.reconstruct_arr()) { - *this_sum += this_value.as_u128(); + let (res0, res1, res2) = join3(fut0, fut1, fut2).await; + + let mut sum = 0; + for line in res0.into_iter().zip(res1).zip(res2) { + let ((s0, s1), s2) = line; + for col_sum in [s0, s1, s2].reconstruct_arr() { + sum += col_sum.as_u128(); } } - assert_eq!(sum, [u128::from(width); N]); + assert_eq!(sum, u128::from(width)); } async fn circuit<'a, F, const N: usize>( - top_ctx: &[SemiHonestContext<'a>; 3], + ctx: SemiHonestContext<'a>, record_id: RecordId, depth: u16, -) -> [Replicated; 3] + mut a: Replicated, + b: Vec>, +) -> Replicated where F: Field + FieldSimd, Replicated: SecureMul>, - [F; N]: IntoShares>, { - assert_eq!( - depth % u16::try_from(N).unwrap(), - 0, - "depth must be a multiple of vectorization factor" - ); - - let mut a = [F::ONE; N].share_with(&mut thread_rng()); - - for stripe in 0..(depth / u16::try_from(N).unwrap()) { - let b = [F::ONE; N].share_with(&mut thread_rng()); - let stripe_ctx = narrow_contexts(top_ctx, &format!("s{stripe}")); - a = async move { - let mut coll = Vec::new(); - for (i, ctx) in stripe_ctx.iter().enumerate() { - let mul = a[i].multiply( - &b[i], - ctx.narrow("mult") - .set_total_records(TotalRecords::Indeterminate), - record_id, - ); - coll.push(mul); - } - - join3v(coll).await - } - .await; + assert_eq!(b.len(), usize::from(depth)); + for (stripe_ix, stripe) in b.iter().enumerate() { + let stripe_ctx = ctx.narrow(&format!("s{stripe_ix}")); + a = a.multiply(stripe, stripe_ctx, record_id).await.unwrap(); } a From 2276c1b2cfbcfba7f4ee3f85b6a11e223904d42d Mon Sep 17 00:00:00 2001 From: Andy Leiserson Date: Thu, 22 Feb 2024 12:14:42 -0800 Subject: [PATCH 2/4] Add benchmarks to coverage --- ipa-core/benches/ct/arithmetic_circuit.rs | 22 ++++++++++++++++------ pre-commit | 2 ++ scripts/coverage-ci | 4 ++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/ipa-core/benches/ct/arithmetic_circuit.rs b/ipa-core/benches/ct/arithmetic_circuit.rs index 73d45befc..d3b19c7f3 100644 --- a/ipa-core/benches/ct/arithmetic_circuit.rs +++ b/ipa-core/benches/ct/arithmetic_circuit.rs @@ -63,13 +63,23 @@ pub fn criterion_benchmark(c: &mut Criterion) { // Note that the width parameter (3rd-to-last argument to do_benchmark) must // be a multiple of the vectorization width. - do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024); - do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024); + #[cfg(not(coverage))] + { + do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024); + do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024); - do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024); - do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024); - do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32); - do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32); + do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024); + do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024); + do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32); + do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32); + } + + #[cfg(coverage)] + { + do_benchmark::<_, Fp31, 1>(&rt, &mut group, 256, 64, 32); + do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 256, 64, 32); + do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 256, 64, 32); + } } criterion_group!(benches, criterion_benchmark); diff --git a/pre-commit b/pre-commit index f081393e2..05e65a720 100755 --- a/pre-commit +++ b/pre-commit @@ -90,6 +90,8 @@ check "Clippy concurrency checks" \ check "Clippy web checks" \ cargo clippy --tests --no-default-features --features "cli web-app real-world-infra test-fixture descriptive-gate" -- -D warnings +# The tests here need to be kept in sync with scripts/coverage-ci. + check "Tests" \ cargo test diff --git a/scripts/coverage-ci b/scripts/coverage-ci index eefcbb21d..b7b026534 100755 --- a/scripts/coverage-ci +++ b/scripts/coverage-ci @@ -14,4 +14,8 @@ for gate in "compact" "descriptive"; do cargo test --no-default-features --features "cli web-app real-world-infra test-fixture $gate-gate" done +cargo test --bench oneshot_ipa --no-default-features --features "enable-benches descriptive-gate" -- -n 62 -c 16 + +cargo test --bench criterion_arithmetic --no-default-features --features "enable-benches descriptive-gate" + cargo llvm-cov report "$@" From 79642a2f027d4a0ccfe0de9ce1eba6e3aa7ff7ac Mon Sep 17 00:00:00 2001 From: Andy Leiserson Date: Thu, 22 Feb 2024 14:47:33 -0800 Subject: [PATCH 3/4] Fix a trait solver issue that only shows up in coverage builds --- ipa-core/src/test_fixture/sharing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipa-core/src/test_fixture/sharing.rs b/ipa-core/src/test_fixture/sharing.rs index d120090d4..f1559e123 100644 --- a/ipa-core/src/test_fixture/sharing.rs +++ b/ipa-core/src/test_fixture/sharing.rs @@ -8,7 +8,7 @@ use crate::{ semi_honest::AdditiveShare as Replicated, ReplicatedSecretSharing, }, - BitDecomposed, FieldSimd, SharedValue, Vectorizable, + BitDecomposed, SharedValue, Vectorizable, }, }; @@ -76,7 +76,7 @@ impl Reconstruct for [Replicated; 3] { } } -impl, const N: usize> ReconstructArr<>::Array> +impl, const N: usize> ReconstructArr<>::Array> for [Replicated; 3] { fn reconstruct_arr(&self) -> >::Array { From d47cb77e1646782fd018ecfa867590ed5c5d3b02 Mon Sep 17 00:00:00 2001 From: Andy Leiserson Date: Fri, 23 Feb 2024 10:38:29 -0800 Subject: [PATCH 4/4] Use unreachable! instead of panic! I was hoping that this would exclude it from coverage, but it didn't. --- ipa-core/src/test_fixture/circuit.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipa-core/src/test_fixture/circuit.rs b/ipa-core/src/test_fixture/circuit.rs index 8bb3655b5..278d57195 100644 --- a/ipa-core/src/test_fixture/circuit.rs +++ b/ipa-core/src/test_fixture/circuit.rs @@ -105,7 +105,7 @@ pub async fn arithmetic( .collect::>() .try_into() else { - panic!("infallible try_into array"); + unreachable!("infallible try_into array"); }; let (res0, res1, res2) = join3(fut0, fut1, fut2).await;