From 76a026d5a5242be96564ec2af8bec70c2b009280 Mon Sep 17 00:00:00 2001 From: Joe Birr-Pixton Date: Sat, 28 Sep 2024 18:22:05 +0100 Subject: [PATCH 1/2] Add avx2 by-8 bignum_copy_row_from_table specialisation --- graviola/src/low/mod.rs | 2 +- .../bignum_copy_row_from_table_8n_avx2.rs | 61 +++++++++++++++++++ .../x86_64/bignum_copy_row_from_table_mux.rs | 23 +++++++ graviola/src/low/x86_64/mod.rs | 2 + 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 graviola/src/low/x86_64/bignum_copy_row_from_table_8n_avx2.rs create mode 100644 graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs diff --git a/graviola/src/low/mod.rs b/graviola/src/low/mod.rs index 7d5016fa7..3aade7303 100644 --- a/graviola/src/low/mod.rs +++ b/graviola/src/low/mod.rs @@ -42,7 +42,7 @@ cfg_if::cfg_if! { pub(crate) use x86_64::bignum_add_p384::bignum_add_p384; pub(crate) use x86_64::bignum_bitsize::bignum_bitsize; pub(crate) use x86_64::bignum_cmp_lt::bignum_cmp_lt; - pub(crate) use x86_64::bignum_copy_row_from_table::bignum_copy_row_from_table; + pub(crate) use x86_64::bignum_copy_row_from_table_mux::bignum_copy_row_from_table; pub(crate) use x86_64::bignum_demont::bignum_demont; pub(crate) use x86_64::bignum_point_select_p256::{bignum_aff_point_select_p256, bignum_jac_point_select_p256}; pub(crate) use x86_64::bignum_point_select_p384::bignum_jac_point_select_p384; diff --git a/graviola/src/low/x86_64/bignum_copy_row_from_table_8n_avx2.rs b/graviola/src/low/x86_64/bignum_copy_row_from_table_8n_avx2.rs new file mode 100644 index 000000000..00aa162d3 --- /dev/null +++ b/graviola/src/low/x86_64/bignum_copy_row_from_table_8n_avx2.rs @@ -0,0 +1,61 @@ +// Written for Graviola by Joe Birr-Pixton, 2024. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +use core::arch::x86_64::*; + +pub fn bignum_copy_row_from_table_8n_avx2( + z: &mut [u64], + table: &[u64], + _height: u64, + width: u64, + index: u64, +) { + debug_assert!(z.len() as u64 == width); + debug_assert!(width % 8 == 0); + debug_assert!(index < _height); + debug_assert!(table.len() as u64 == _height * width); + + unsafe { _bignum_copy_row_from_table_8n_avx2(z, table, width, index) } +} + +#[target_feature(enable = "avx,avx2")] +unsafe fn _bignum_copy_row_from_table_8n_avx2( + z: &mut [u64], + table: &[u64], + width: u64, + index: u64, +) { + _mm_prefetch(table.as_ptr().cast(), _MM_HINT_T0); + _mm_prefetch(table.as_ptr().add(16).cast(), _MM_HINT_T0); + + z.fill(0); + + let desired_index = _mm_set1_epi64x(index as i64); + let desired_index = _mm256_setr_m128i(desired_index, desired_index); + + let index = _mm_set1_epi64x(0); + let mut index = _mm256_setr_m128i(index, index); + + let ones = _mm_set1_epi64x(1); + let ones = _mm256_setr_m128i(ones, ones); + + for row in table.chunks_exact(width as usize) { + let mask = _mm256_cmpeq_epi64(index, desired_index); + index = _mm256_add_epi64(index, ones); + + for (i, zz) in z.chunks_exact_mut(8).enumerate() { + let row0 = _mm256_loadu_si256(row.as_ptr().add(i * 8).cast()); + let row1 = _mm256_loadu_si256(row.as_ptr().add(i * 8 + 4).cast()); + + let row0 = _mm256_and_si256(row0, mask); + let row1 = _mm256_and_si256(row1, mask); + + let store0 = _mm256_loadu_si256(zz.as_ptr().add(0).cast()); + let store1 = _mm256_loadu_si256(zz.as_ptr().add(4).cast()); + let store0 = _mm256_xor_si256(store0, row0); + let store1 = _mm256_xor_si256(store1, row1); + _mm256_storeu_si256(zz.as_mut_ptr().add(0).cast(), store0); + _mm256_storeu_si256(zz.as_mut_ptr().add(4).cast(), store1); + } + } +} diff --git a/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs b/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs new file mode 100644 index 000000000..b84d81247 --- /dev/null +++ b/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs @@ -0,0 +1,23 @@ +// Written for Graviola by Joe Birr-Pixton, 2024. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +/// Multiplex between specialisations of `bignum_copy_row_from_table` +#[inline] +pub fn bignum_copy_row_from_table( + z: &mut [u64], + table: &[u64], + height: u64, + width: u64, + index: u64, +) { + match width { + width if width % 8 == 0 => { + super::bignum_copy_row_from_table_8n_avx2::bignum_copy_row_from_table_8n_avx2( + z, table, height, width, index, + ) + } + width => super::bignum_copy_row_from_table::bignum_copy_row_from_table( + z, table, height, width, index, + ), + } +} diff --git a/graviola/src/low/x86_64/mod.rs b/graviola/src/low/x86_64/mod.rs index 7de9f111e..09bd0795c 100644 --- a/graviola/src/low/x86_64/mod.rs +++ b/graviola/src/low/x86_64/mod.rs @@ -9,6 +9,8 @@ pub(crate) mod bignum_add_p384; pub(crate) mod bignum_bitsize; pub(crate) mod bignum_cmp_lt; pub(crate) mod bignum_copy_row_from_table; +pub(crate) mod bignum_copy_row_from_table_8n_avx2; +pub(crate) mod bignum_copy_row_from_table_mux; pub(crate) mod bignum_demont; pub(crate) mod bignum_demont_p256; pub(crate) mod bignum_demont_p384; From ff13cf88093f505e0c3aaa9654dce9eaf9bcf60c Mon Sep 17 00:00:00 2001 From: Joe Birr-Pixton Date: Sat, 28 Sep 2024 18:35:23 +0100 Subject: [PATCH 2/2] Add avx2 width=16 bignum_copy_row_from_table specialisation --- .../bignum_copy_row_from_table_16_avx2.rs | 57 +++++++++++++++++++ .../x86_64/bignum_copy_row_from_table_mux.rs | 3 + graviola/src/low/x86_64/mod.rs | 1 + 3 files changed, 61 insertions(+) create mode 100644 graviola/src/low/x86_64/bignum_copy_row_from_table_16_avx2.rs diff --git a/graviola/src/low/x86_64/bignum_copy_row_from_table_16_avx2.rs b/graviola/src/low/x86_64/bignum_copy_row_from_table_16_avx2.rs new file mode 100644 index 000000000..86c317d8e --- /dev/null +++ b/graviola/src/low/x86_64/bignum_copy_row_from_table_16_avx2.rs @@ -0,0 +1,57 @@ +// Written for Graviola by Joe Birr-Pixton, 2024. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +use core::arch::x86_64::*; + +pub fn bignum_copy_row_from_table_16_avx2(z: &mut [u64], table: &[u64], _height: u64, index: u64) { + debug_assert!(z.len() == 16); + debug_assert!(index < _height); + debug_assert!(table.len() == (_height as usize) * z.len()); + + unsafe { _bignum_copy_row_from_table_16_avx2(z, table, index) } +} + +#[target_feature(enable = "avx,avx2")] +unsafe fn _bignum_copy_row_from_table_16_avx2(z: &mut [u64], table: &[u64], index: u64) { + _mm_prefetch(table.as_ptr().cast(), _MM_HINT_T0); + _mm_prefetch(table.as_ptr().add(16).cast(), _MM_HINT_T0); + + let mut acc0 = _mm256_setzero_si256(); + let mut acc1 = _mm256_setzero_si256(); + let mut acc2 = _mm256_setzero_si256(); + let mut acc3 = _mm256_setzero_si256(); + + let desired_index = _mm_set1_epi64x(index as i64); + let desired_index = _mm256_setr_m128i(desired_index, desired_index); + + let index = _mm_set1_epi64x(0); + let mut index = _mm256_setr_m128i(index, index); + + let ones = _mm_set1_epi64x(1); + let ones = _mm256_setr_m128i(ones, ones); + + for row in table.chunks_exact(16) { + let mask = _mm256_cmpeq_epi64(index, desired_index); + index = _mm256_add_epi64(index, ones); + + let row0 = _mm256_loadu_si256(row.as_ptr().add(0).cast()); + let row1 = _mm256_loadu_si256(row.as_ptr().add(4).cast()); + let row2 = _mm256_loadu_si256(row.as_ptr().add(8).cast()); + let row3 = _mm256_loadu_si256(row.as_ptr().add(12).cast()); + + let row0 = _mm256_and_si256(row0, mask); + let row1 = _mm256_and_si256(row1, mask); + let row2 = _mm256_and_si256(row2, mask); + let row3 = _mm256_and_si256(row3, mask); + + acc0 = _mm256_xor_si256(row0, acc0); + acc1 = _mm256_xor_si256(row1, acc1); + acc2 = _mm256_xor_si256(row2, acc2); + acc3 = _mm256_xor_si256(row3, acc3); + } + + _mm256_storeu_si256(z.as_mut_ptr().add(0).cast(), acc0); + _mm256_storeu_si256(z.as_mut_ptr().add(4).cast(), acc1); + _mm256_storeu_si256(z.as_mut_ptr().add(8).cast(), acc2); + _mm256_storeu_si256(z.as_mut_ptr().add(12).cast(), acc3); +} diff --git a/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs b/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs index b84d81247..2d4454c79 100644 --- a/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs +++ b/graviola/src/low/x86_64/bignum_copy_row_from_table_mux.rs @@ -11,6 +11,9 @@ pub fn bignum_copy_row_from_table( index: u64, ) { match width { + 16 => super::bignum_copy_row_from_table_16_avx2::bignum_copy_row_from_table_16_avx2( + z, table, height, index, + ), width if width % 8 == 0 => { super::bignum_copy_row_from_table_8n_avx2::bignum_copy_row_from_table_8n_avx2( z, table, height, width, index, diff --git a/graviola/src/low/x86_64/mod.rs b/graviola/src/low/x86_64/mod.rs index 09bd0795c..52a684d54 100644 --- a/graviola/src/low/x86_64/mod.rs +++ b/graviola/src/low/x86_64/mod.rs @@ -9,6 +9,7 @@ pub(crate) mod bignum_add_p384; pub(crate) mod bignum_bitsize; pub(crate) mod bignum_cmp_lt; pub(crate) mod bignum_copy_row_from_table; +pub(crate) mod bignum_copy_row_from_table_16_avx2; pub(crate) mod bignum_copy_row_from_table_8n_avx2; pub(crate) mod bignum_copy_row_from_table_mux; pub(crate) mod bignum_demont;