Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dilithium/ML-DSA Stack Optimizations #340

Merged
merged 32 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
44e901c
Init dilithium3 stack optimized variant
dop-amin Mar 15, 2024
80c9e07
Start stack optimization [Passing]
dop-amin Mar 15, 2024
5c5b868
Compress w
dop-amin Mar 15, 2024
926e957
Eliminate z, y
dop-amin Mar 15, 2024
302f7f2
Eliminate cp
dop-amin Mar 15, 2024
3c36dbe
Eliminate s1, s2
dop-amin Mar 15, 2024
f71e025
Eliminate second poly needed for A*y
dop-amin Mar 15, 2024
deeabab
Inline sampling uniform and uniform_gamma1
dop-amin Mar 18, 2024
cbc29cf
Inline hint generation
dop-amin Mar 18, 2024
8468d60
Inline polyw subtraction
dop-amin Mar 18, 2024
b4505e7
Refactor decompose to high/lowbits
dop-amin Mar 18, 2024
f5a8a65
Inline Keccak state
dop-amin Mar 18, 2024
10d4766
Shared buffer for polynomials
dop-amin Mar 18, 2024
2804237
rm 257 FFT
dop-amin Mar 18, 2024
d30a766
Union for small and big poly
dop-amin Mar 18, 2024
a37b5a6
Eliminate some smaller buffers
dop-amin Mar 18, 2024
2bd00ad
Remove asym small mul
dop-amin Mar 18, 2024
77a7572
Stack friendly uniform_gamma1 w/o add
dop-amin Mar 18, 2024
6609f82
Stack optimized Dilithium{2,5}
dop-amin Mar 18, 2024
59724a7
Switch to Plantard-based 769 NTT
dop-amin Mar 19, 2024
0dd789b
First batch of stack opt for Verify
dop-amin Mar 20, 2024
a8c993f
On-the-fly unpacking for z, h
dop-amin Mar 20, 2024
b7ded84
Compress w
dop-amin Mar 20, 2024
e6e164b
rm tmp poly, subtract on wcomp
dop-amin Mar 20, 2024
6ef4fbc
Verify Stack Optimizations
dop-amin Mar 30, 2024
9870bec
rm buffers/unionize in Verify
dop-amin Mar 31, 2024
1d21996
Stack opt key pair
dop-amin Apr 8, 2024
76b16c1
Overlap buffers
dop-amin Apr 8, 2024
e718f2e
Stack optimized challenge generation
dop-amin Apr 8, 2024
a37b311
Match 769 Plantard to m4f code
dop-amin Apr 9, 2024
d401a15
update skiplist
mkannwischer Apr 15, 2024
c013920
update benchmarks
mkannwischer Apr 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
First batch of stack opt for Verify
* On-the-fly matrix generation
* Schoolbook for ct1
* Challenge compression
  • Loading branch information
dop-amin authored and mkannwischer committed Apr 15, 2024
commit 0dd789b5fe2138f40ff741bf1641bc3c683e7090
59 changes: 36 additions & 23 deletions crypto_sign/dilithium3/m4fstack/sign.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,16 +297,19 @@ int crypto_sign_verify(const uint8_t *sig,
const uint8_t *pk)
{
unsigned int i;
uint8_t buf[K*POLYW1_PACKEDBYTES];
uint8_t w1_packed[POLYW1_PACKEDBYTES];
uint8_t rho[SEEDBYTES];
uint8_t mu[CRHBYTES];
uint8_t c[CTILDEBYTES];
uint8_t c2[CTILDEBYTES];
poly cp;
polyvecl mat[K], z;
polyveck t1, w1, h;
polyvecl z;
polyveck h, t1;
poly w1, cp, tmp0;
shake256incctx state;

uint8_t wcomp[768];
uint8_t ccomp[68];

if(siglen != CRYPTO_BYTES)
return -1;

Expand All @@ -325,30 +328,40 @@ int crypto_sign_verify(const uint8_t *sig,
shake256_inc_squeeze(mu, CRHBYTES, &state);

/* Matrix-vector multiplication; compute Az - c*2^d*t1 */
poly_challenge(&cp, c);
polyvec_matrix_expand(mat, rho);

polyvecl_ntt(&z);
polyvec_matrix_pointwise_montgomery(&w1, mat, &z);

poly_challenge(&cp, sig);
poly_challenge_compress(ccomp, &cp);
poly_ntt(&cp);
polyveck_shiftl(&t1);
polyveck_ntt(&t1);
polyveck_pointwise_poly_montgomery(&t1, &cp, &t1);

polyveck_sub(&w1, &w1, &t1);
polyveck_reduce(&w1);
polyveck_invntt_tomont(&w1);

/* Reconstruct w1 */
polyveck_caddq(&w1);
polyveck_use_hint(&w1, &w1, &h);
polyveck_pack_w1(buf, &w1);
polyvecl_ntt(&z);

/* Call random oracle and verify challenge */
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES);

for (size_t k_idx = 0; k_idx < K; k_idx++) {
poly_uniform(&tmp0, rho, (k_idx << 8) + 0);
poly_pointwise_montgomery(&w1, &tmp0, &z.vec[0]);
for (size_t l_idx = 1; l_idx < L; l_idx++) {
poly_uniform(&tmp0, rho, (k_idx << 8) + l_idx);
poly_pointwise_acc_montgomery(&w1, &tmp0, &z.vec[l_idx]);
}

poly_reduce(&w1);
poly_invntt_tomont(&w1);

poly_schoolbook_t1(&tmp0, ccomp, pk + SEEDBYTES + k_idx*POLYT1_PACKEDBYTES);

// TODO invNTT before sub because of schoolbook
poly_sub(&w1, &w1, &tmp0);
poly_reduce(&w1);

/* Reconstruct w1 */
poly_caddq(&w1);
poly_use_hint(&w1, &w1, &h.vec[k_idx]);
polyw1_pack(w1_packed, &w1);

shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES);
}
/* Call random oracle and verify challenge */
shake256_inc_finalize(&state);
shake256_inc_squeeze(c2, CTILDEBYTES, &state);
for(i = 0; i < CTILDEBYTES; ++i)
Expand Down
52 changes: 52 additions & 0 deletions crypto_sign/dilithium3/m4fstack/stack.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,28 @@ static inline int32_t polyt0_unpack_idx(const uint8_t *t0, unsigned idx){
return (1 << (D-1)) - coeff;
}

/*
 * Extract a single 10-bit coefficient from a bit-packed t1 polynomial.
 *
 * Four coefficients occupy five consecutive bytes. Coefficient `idx` starts
 * in byte (idx mod 4) of its 5-byte group, at bit offset 2*(idx mod 4), and
 * spills into the following byte.
 *
 * t1:  pointer to the first byte of the packed polynomial
 * idx: index of the coefficient to extract
 *
 * Returns the coefficient as a value in [0, 0x3FF].
 */
static inline int32_t polyt1_unpack_idx(const uint8_t *t1, unsigned idx){
    const unsigned lane = idx & 3u;      /* position inside the 4-coefficient group */
    const unsigned shift = 2u * lane;    /* bit offset inside the first byte */
    const uint8_t *group = t1 + 5u * (idx >> 2) + lane;
    uint32_t bits;

    /* Low part comes from the first byte, high part from the next one. */
    bits = (uint32_t)group[0] >> shift;
    bits |= (uint32_t)group[1] << (8u - shift);

    return (int32_t)(bits & 0x3FFu);
}

void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){
unsigned i,j,idx;
uint64_t signs = 0;
Expand Down Expand Up @@ -118,6 +140,36 @@ void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0){
}
}

/*
 * Schoolbook product of a compressed challenge with a bit-packed t1
 * polynomial, with every t1 coefficient scaled by 2^D.
 *
 * c:     output polynomial; it is cleared first and then accumulated into
 * ccomp: compressed challenge; bytes [0, TAU) are the positions of the
 *        non-zero challenge coefficients, bytes [60, 68) hold one sign bit
 *        per position (little-endian, bit k belongs to position k;
 *        0 = add, 1 = subtract)
 * t1:    bit-packed t1 polynomial, read through polyt1_unpack_idx()
 *
 * For each position the shifted t1 is added (or subtracted); terms whose
 * index would reach N wrap around to the low coefficients with the
 * opposite sign.
 */
void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1){
    unsigned pos, j, k;
    int32_t sgn;
    uint64_t sign_bits = 0;

    for(j = 0; j < N; j++){
        c->coeffs[j] = 0;
    }

    /* Gather the 64 sign bits stored after the position bytes. */
    for(k = 0; k < 8; k++){
        sign_bits |= ((uint64_t)ccomp[60+k]) << (8*k);
    }

    for(k = 0; k < TAU; k++, sign_bits >>= 1){
        pos = ccomp[k];
        sgn = (sign_bits & 1) ? -1 : 1;

        /* Part that stays below index N keeps the challenge sign. */
        for(j = 0; j < N - pos; j++){
            c->coeffs[pos+j] += sgn * (polyt1_unpack_idx(t1, j) << D);
        }
        /* Remaining part wraps around and flips sign. */
        for(; j < N; j++){
            c->coeffs[pos+j-N] -= sgn * (polyt1_unpack_idx(t1, j) << D);
        }
    }
}


void polyw_pack(uint8_t buf[3*256], poly *w){
poly_reduce(w);
Expand Down
1 change: 1 addition & 0 deletions crypto_sign/dilithium3/m4fstack/stack.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ void poly_challenge_decompress(poly *cp, const uint8_t c[68]);


/* Schoolbook product of the compressed challenge `ccomp` with a packed t0
 * polynomial, written to `c`. NOTE(review): presumably the t0 counterpart of
 * poly_schoolbook_t1; confirm against the implementation in stack.c. */
void poly_schoolbook(poly *c, const uint8_t ccomp[68], const uint8_t *t0);
/* Schoolbook product of the compressed challenge `ccomp` with the bit-packed
 * t1 polynomial, each t1 coefficient being shifted left by D; result in `c`. */
void poly_schoolbook_t1(poly *c, const uint8_t ccomp[68], const uint8_t *t1);
/* Packs `w` into the 3*256-byte buffer `buf`. `w` is not const: the
 * implementation reduces it in place before packing. */
void polyw_pack(uint8_t buf[3*256], poly *w);
/* NOTE(review): presumably the inverse of polyw_pack (fills `w` from the
 * 3*256-byte buffer `buf`); implementation not visible here, please confirm. */
void polyw_unpack(poly *w, const uint8_t buf[3*256]);

Expand Down