Skip to content

Commit

Permalink
Merge pull request #959 from andyleiserson/arithmetic-benchmark
Browse files Browse the repository at this point in the history
Rewritten arithmetic circuit benchmark
  • Loading branch information
andyleiserson authored Mar 6, 2024
2 parents 212ee7b + d47cb77 commit 951bff3
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 65 deletions.
50 changes: 38 additions & 12 deletions ipa-core/benches/ct/arithmetic_circuit.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use criterion::{
black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
BenchmarkId, Criterion, SamplingMode, Throughput,
black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize,
BenchmarkGroup, BenchmarkId, Criterion, SamplingMode, Throughput,
};
use ipa_core::{
ff::{Field, Fp31, Fp32BitPrime, U128Conversions},
Expand All @@ -16,6 +16,7 @@ fn do_benchmark<M, F, const N: usize>(
group: &mut BenchmarkGroup<M>,
width: u32,
depth: u16,
active_work: usize,
) where
M: Measurement,
F: Field + FieldSimd<N> + U128Conversions,
Expand All @@ -25,11 +26,24 @@ fn do_benchmark<M, F, const N: usize>(
{
group.throughput(Throughput::Elements((width * depth as u32) as u64));
group.bench_with_input(
BenchmarkId::new("circuit", format!("{width}:{depth}:{}x{}", F::NAME, N)),
BenchmarkId::new(
"circuit",
format!("{width}:{depth}:{active_work}:{}x{}", F::NAME, N),
),
&(width, depth),
|b, &(width, depth)| {
b.to_async(rt)
.iter(|| circuit::arithmetic::<F, N>(black_box(width), black_box(depth)));
b.to_async(rt).iter_batched(
|| circuit::arithmetic_setup(width, depth),
|input| {
circuit::arithmetic::<F, N>(
black_box(width),
black_box(depth),
active_work,
input,
)
},
BatchSize::PerIteration,
);
},
);
}
Expand All @@ -46,14 +60,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
group.sample_size(10);
group.sampling_mode(SamplingMode::Flat);

do_benchmark::<_, Fp31, 1>(&rt, &mut group, 512_000, 1);
do_benchmark::<_, Fp31, 1>(&rt, &mut group, 51_200, 10);
do_benchmark::<_, Fp31, 1>(&rt, &mut group, 8_000, 64);
// Note that the width parameter (3rd-to-last argument to do_benchmark) must
// be a multiple of the vectorization width.

do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 25_600, 10);
do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 2_560, 100);
do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_000, 64);
do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 250, 1_024);
#[cfg(not(coverage))]
{
do_benchmark::<_, Fp31, 1>(&rt, &mut group, 4_096, 64, 1024);
do_benchmark::<_, Fp31, 1>(&rt, &mut group, 1_024, 256, 1024);

do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 4_096, 64, 1024);
do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 1_024, 256, 1024);
do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 4_096, 64, 32);
do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 1_024, 256, 32);
}

#[cfg(coverage)]
{
do_benchmark::<_, Fp31, 1>(&rt, &mut group, 256, 64, 32);
do_benchmark::<_, Fp32BitPrime, 1>(&rt, &mut group, 256, 64, 32);
do_benchmark::<_, Fp32BitPrime, 32>(&rt, &mut group, 256, 64, 32);
}
}

criterion_group!(benches, criterion_benchmark);
Expand Down
10 changes: 9 additions & 1 deletion ipa-core/benches/iai/arithmetic_circuit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,16 @@ pub fn iai_benchmark() {
const CIRCUIT_WIDTH: u32 = 500_000;
const CIRCUIT_DEPTH: u16 = 1;

tracing::warn!("test data generation may skew results of this benchmark");
rt.block_on(async {
circuit::arithmetic::<Fp31, 1>(black_box(CIRCUIT_WIDTH), black_box(CIRCUIT_DEPTH)).await;
let input = circuit::arithmetic_setup(CIRCUIT_WIDTH, CIRCUIT_DEPTH);
circuit::arithmetic::<Fp31, 1>(
black_box(CIRCUIT_WIDTH),
black_box(CIRCUIT_DEPTH),
1024,
input,
)
.await;
})
}

Expand Down
3 changes: 2 additions & 1 deletion ipa-core/benches/oneshot/arithmetic_circuit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ pub async fn main() {
println!("benchmark parameters: Field size: {field_size} bits, circuit width: {width}, depth: {depth}");
}

let input = circuit::arithmetic_setup(args.width, args.depth);
let start = Instant::now();
circuit::arithmetic::<Fp31, 1>(args.width, args.depth).await;
circuit::arithmetic::<Fp31, 1>(args.width, args.depth, 1024, input).await;
let duration = start.elapsed().as_secs_f32();

println!("benchmark complete after {duration}s");
Expand Down
150 changes: 101 additions & 49 deletions ipa-core/src/test_fixture/circuit.rs
Original file line number Diff line number Diff line change
@@ -1,89 +1,141 @@
use futures_util::future::join_all;
use std::{array, num::NonZeroUsize};

use futures::{future::join3, stream, StreamExt};
use rand::distributions::{Distribution, Standard};

use super::join3v;
use crate::{
ff::{Field, U128Conversions},
helpers::TotalRecords,
helpers::{GatewayConfig, TotalRecords},
protocol::{
basics::SecureMul,
context::{Context, SemiHonestContext},
RecordId,
},
rand::thread_rng,
secret_sharing::{replicated::semi_honest::AdditiveShare as Replicated, FieldSimd, IntoShares},
test_fixture::{narrow_contexts, ReconstructArr, TestWorld},
seq_join::seq_join,
test_fixture::{ReconstructArr, TestWorld, TestWorldConfig},
};

/// Creates an arithmetic circuit with the given width and depth.
/// One helper's share of the inputs for a single (vectorized) record of the
/// arithmetic circuit benchmark.
///
/// Values are produced by `arithmetic_setup` and consumed by `arithmetic`,
/// which destructures them and passes `a` and `b` to the per-record circuit.
pub struct Inputs<F: Field + FieldSimd<N>, const N: usize> {
/// Share of the starting value that is multiplied through the circuit.
a: Replicated<F, N>,
/// Shares of the multiplicands; `arithmetic_setup` generates `depth` of them
/// per record, one for each multiplication layer.
b: Vec<Replicated<F, N>>,
}

impl<F: Field + FieldSimd<N>, const N: usize> Inputs<F, N> {
/// Bundles a starting share `a` with its sequence of multiplicand shares `b`.
fn new(a: Replicated<F, N>, b: Vec<Replicated<F, N>>) -> Self {
Self { a, b }
}
}

/// Generates test data for the arithmetic circuit benchmark.
///
/// # Panics
/// panics when circuits did not produce the expected value.
pub async fn arithmetic<F, const N: usize>(width: u32, depth: u16)
/// On functional errors, since this is a benchmark.
#[must_use]
pub fn arithmetic_setup<F, const N: usize>(width: u32, depth: u16) -> [Vec<Inputs<F, N>>; 3]
where
F: Field + FieldSimd<N>,
Standard: Distribution<F>,
{
let mut rng = thread_rng();
let mut data = array::from_fn(|_| Vec::with_capacity(width as usize / N));
for _ in 0..(width / u32::try_from(N).unwrap()) {
let [a0, a1, a2] = [F::ONE; N].share_with(&mut rng);
let mut b0 = Vec::with_capacity(depth as usize);
let mut b1 = Vec::with_capacity(depth as usize);
let mut b2 = Vec::with_capacity(depth as usize);
for _ in 0..(depth as usize) {
let [s0, s1, s2] = [F::ONE; N].share_with(&mut rng);
b0.push(s0);
b1.push(s1);
b2.push(s2);
}
data[0].push(Inputs::new(a0, b0));
data[1].push(Inputs::new(a1, b1));
data[2].push(Inputs::new(a2, b2));
}
data
}

/// Creates an arithmetic circuit with the given width and depth.
///
/// # Panics
/// On functional errors, since this is a benchmark.
pub async fn arithmetic<F, const N: usize>(
width: u32,
depth: u16,
active_work: usize,
input_data: [Vec<Inputs<F, N>>; 3],
) where
F: Field + FieldSimd<N> + U128Conversions,
for<'a> Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
[F; N]: IntoShares<Replicated<F, N>>,
Standard: Distribution<F>,
{
let world = TestWorld::default();
let config = TestWorldConfig {
gateway_config: GatewayConfig::new(active_work),
..Default::default()
};
let world = TestWorld::new_with(config);

// Re-use contexts for the entire execution because record identifiers are contiguous.
let contexts = world.contexts();

let mut multiplications = Vec::new();
for record in 0..width {
let circuit_result = circuit(&contexts, RecordId::from(record), depth);
multiplications.push(circuit_result);
}
let [inp0, inp1, inp2] = input_data;

let Ok([fut0, fut1, fut2]): Result<[_; 3], _> = contexts
.into_iter()
.zip([inp0, inp1, inp2])
.map(|(ctx, col_data)| {
// Setting TotalRecords::Indeterminate causes OrderingSender to make data available to
// the channel immediately, instead of doing so only after active_work records have
// accumulated. This gives the best performance for vectorized operation.
let ctx = ctx.set_total_records(TotalRecords::Indeterminate);
seq_join(
NonZeroUsize::new(active_work).unwrap(),
stream::iter((0..(width / u32::try_from(N).unwrap())).zip(col_data)).map(
move |(record, Inputs { a, b })| {
circuit(ctx.clone(), RecordId::from(record), depth, a, b)
},
),
)
.collect::<Vec<_>>()
})
.collect::<Vec<_>>()
.try_into()
else {
unreachable!("infallible try_into array");
};

#[allow(clippy::disallowed_methods)] // Just for testing purposes.
let results = join_all(multiplications).await;
let mut sum = [0u128; N];
for line in results {
for (this_sum, this_value) in sum.iter_mut().zip(line.reconstruct_arr()) {
*this_sum += this_value.as_u128();
let (res0, res1, res2) = join3(fut0, fut1, fut2).await;

let mut sum = 0;
for line in res0.into_iter().zip(res1).zip(res2) {
let ((s0, s1), s2) = line;
for col_sum in [s0, s1, s2].reconstruct_arr() {
sum += col_sum.as_u128();
}
}

assert_eq!(sum, [u128::from(width); N]);
assert_eq!(sum, u128::from(width));
}

async fn circuit<'a, F, const N: usize>(
top_ctx: &[SemiHonestContext<'a>; 3],
ctx: SemiHonestContext<'a>,
record_id: RecordId,
depth: u16,
) -> [Replicated<F, N>; 3]
mut a: Replicated<F, N>,
b: Vec<Replicated<F, N>>,
) -> Replicated<F, N>
where
F: Field + FieldSimd<N>,
Replicated<F, N>: SecureMul<SemiHonestContext<'a>>,
[F; N]: IntoShares<Replicated<F, N>>,
{
assert_eq!(
depth % u16::try_from(N).unwrap(),
0,
"depth must be a multiple of vectorization factor"
);

let mut a = [F::ONE; N].share_with(&mut thread_rng());

for stripe in 0..(depth / u16::try_from(N).unwrap()) {
let b = [F::ONE; N].share_with(&mut thread_rng());
let stripe_ctx = narrow_contexts(top_ctx, &format!("s{stripe}"));
a = async move {
let mut coll = Vec::new();
for (i, ctx) in stripe_ctx.iter().enumerate() {
let mul = a[i].multiply(
&b[i],
ctx.narrow("mult")
.set_total_records(TotalRecords::Indeterminate),
record_id,
);
coll.push(mul);
}

join3v(coll).await
}
.await;
assert_eq!(b.len(), usize::from(depth));
for (stripe_ix, stripe) in b.iter().enumerate() {
let stripe_ctx = ctx.narrow(&format!("s{stripe_ix}"));
a = a.multiply(stripe, stripe_ctx, record_id).await.unwrap();
}

a
Expand Down
4 changes: 2 additions & 2 deletions ipa-core/src/test_fixture/sharing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::{
semi_honest::AdditiveShare as Replicated,
ReplicatedSecretSharing,
},
BitDecomposed, FieldSimd, SharedValue, Vectorizable,
BitDecomposed, SharedValue, Vectorizable,
},
};

Expand Down Expand Up @@ -76,7 +76,7 @@ impl<V: SharedValue> Reconstruct<V> for [Replicated<V>; 3] {
}
}

impl<F: Field + FieldSimd<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
impl<F: Field + Vectorizable<N>, const N: usize> ReconstructArr<<F as Vectorizable<N>>::Array>
for [Replicated<F, N>; 3]
{
fn reconstruct_arr(&self) -> <F as Vectorizable<N>>::Array {
Expand Down
2 changes: 2 additions & 0 deletions pre-commit
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ check "Clippy concurrency checks" \
check "Clippy web checks" \
cargo clippy --tests --no-default-features --features "cli web-app real-world-infra test-fixture descriptive-gate" -- -D warnings

# The tests here need to be kept in sync with scripts/coverage-ci.

check "Tests" \
cargo test

Expand Down
4 changes: 4 additions & 0 deletions scripts/coverage-ci
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@ for gate in "compact" "descriptive"; do
cargo test --no-default-features --features "cli web-app real-world-infra test-fixture $gate-gate"
done

cargo test --bench oneshot_ipa --no-default-features --features "enable-benches descriptive-gate" -- -n 62 -c 16

cargo test --bench criterion_arithmetic --no-default-features --features "enable-benches descriptive-gate"

cargo llvm-cov report "$@"

0 comments on commit 951bff3

Please sign in to comment.