From f2b698a24055515bc0cb353dcf9afe6e545ebe84 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 6 Aug 2024 15:21:58 +0800 Subject: [PATCH] Improve Dilithium (speed) verification stack usage (#346) * Improve Dilithium (speed) verification stack usage Once upon a time, we wrote a paper on memory-efficient Dilithium [1] which included a speed-optimized version of verification that still included some memory optimizations that don't come at a performance penalty. Unfortunately, with the update of the reference code to round 3, that version did not get migrated, leading to some complaints about verification memory consumption. I finally found some time to port these. Verification speed is essentially unchanged, but stack consumption is much better. [1] https://eprint.iacr.org/2020/1278.pdf * update benchmarks --- benchmarks.csv | 24 +++--- benchmarks.md | 24 +++--- crypto_sign/dilithium2/m4f/packing.c | 106 +++++++++++++++++++++++- crypto_sign/dilithium2/m4f/packing.h | 13 +++ crypto_sign/dilithium2/m4f/poly.c | 12 +++ crypto_sign/dilithium2/m4f/poly.h | 2 + crypto_sign/dilithium2/m4f/sign.c | 117 ++++++++++++++++++--------- crypto_sign/dilithium2/m4f/vector.h | 2 + crypto_sign/dilithium2/m4f/vector.s | 53 ++++++++++++ 9 files changed, 289 insertions(+), 64 deletions(-) diff --git a/benchmarks.csv b/benchmarks.csv index 952b95f8..c614c534 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -46,13 +46,13 @@ cross-sha3-r-sdpg-1-small (10 executions),ref,290135,287741,297757,102853622,102 cross-sha3-r-sdpg-3-fast (10 executions),ref,627948,625525,637639,43573841,43565461,43582933,27513830,27493024,27525746 cross-sha3-r-sdpg-5-fast (10 executions),ref,1146280,1142409,1153794,93557878,93547167,93566329,59948216,59857434,60043852 dilithium2 (1000 executions),clean,1874167,1827645,1914566,7493877,3321630,40762756,2062795,2062255,2063222 -dilithium2 (1000 executions),m4f,1426036,1379636,1466394,3807970,1813656,18528070,1417745,1417203,1418192 +dilithium2 (1000 
executions),m4f,1425723,1379410,1466445,3835095,1813682,16068642,1421307,1420219,1422056 dilithium2 (1000 executions),m4fstack,1801523,1684895,1902114,12170976,3900911,86281518,3241353,3194028,3281144 dilithium3 (1000 executions),clean,3205551,3204090,3207411,12696585,5097364,74392293,3376992,3376581,3377393 -dilithium3 (1000 executions),m4f,2515969,2514498,2517634,5884832,2917322,25268693,2411257,2410858,2411717 +dilithium3 (1000 executions),m4f,2515915,2514307,2517413,6054094,2917316,27829552,2415526,2414696,2416440 dilithium3 (1000 executions),m4fstack,3412759,3406659,3419247,23673016,6733971,145803146,5733307,5688893,5778120 dilithium5 (1000 executions),clean,5341477,5286872,5395822,15710371,7953367,75940093,5609679,5609217,5610183 -dilithium5 (1000 executions),m4f,4275029,4210286,4329519,7977781,4882524,25936176,4185417,4184925,4185896 +dilithium5 (1000 executions),m4f,4275033,4220989,4350945,8349360,4882552,29688762,4192692,4191427,4193764 dilithium5 (1000 executions),m4fstack,5816287,5474236,6115061,33452872,11170780,185259803,9912851,9845789,9981834 falcon-1024 (10 executions),clean,602066436,377135260,1488065363,136241759,136017549,136556585,1678109,1677732,1678566 falcon-1024 (10 executions),m4-ct,408725773,314885208,712370124,87706019,87549942,87839508,990541,984448,997160 @@ -200,13 +200,13 @@ cross-sha3-r-sdpg-1-small,ref,2328,466400,245512,,,,,, cross-sha3-r-sdpg-3-fast,ref,4032,205080,108236,,,,,, cross-sha3-r-sdpg-5-fast,ref,6824,398600,213436,,,,,, dilithium2,clean,38304,51968,36192,,,,,, -dilithium2,m4f,38296,49416,36220,,,,,, +dilithium2,m4f,38296,49416,9012,,,,,, dilithium2,m4fstack,4408,5072,2704,,,,,, dilithium3,clean,60832,79616,57728,,,,,, -dilithium3,m4f,60824,68864,57720,,,,,, +dilithium3,m4f,60824,68864,9880,,,,,, dilithium3,m4fstack,4408,6608,2704,,,,,, dilithium5,clean,97696,122724,92940,,,,,, -dilithium5,m4f,97688,116076,92932,,,,,, +dilithium5,m4f,97688,116076,11944,,,,,, dilithium5,m4fstack,4408,8136,2712,,,,,, 
falcon-1024,clean,35076,84604,8776,,,,,, falcon-1024,m4-ct,1156,2508,376,,,,,, @@ -355,13 +355,13 @@ cross-sha3-r-sdpg-1-small,ref,71.8,74.7,78.4,,,,,, cross-sha3-r-sdpg-3-fast,ref,71.7,68.2,68.7,,,,,, cross-sha3-r-sdpg-5-fast,ref,71.1,66.1,66.8,,,,,, dilithium2,clean,61.0,30.9,52.9,,,,,, -dilithium2,m4f,79.9,60.6,76.8,,,,,, +dilithium2,m4f,79.9,60.7,76.6,,,,,, dilithium2,m4fstack,74.8,55.2,40.8,,,,,, dilithium3,clean,64.7,31.3,56.8,,,,,, -dilithium3,m4f,82.3,60.3,79.4,,,,,, +dilithium3,m4f,82.3,60.7,79.2,,,,,, dilithium3,m4fstack,77.1,54.6,41.0,,,,,, dilithium5,clean,67.0,35.7,61.1,,,,,, -dilithium5,m4f,83.5,65.0,81.7,,,,,, +dilithium5,m4f,83.5,65.3,81.6,,,,,, dilithium5,m4fstack,76.1,54.5,42.6,,,,,, falcon-1024,clean,8.9,0.3,23.7,,,,,, falcon-1024,m4-ct,8.6,0.4,32.2,,,,,, @@ -509,13 +509,13 @@ cross-sha3-r-sdpg-1-small,ref,18846,0,208,19054,,,,, cross-sha3-r-sdpg-3-fast,ref,19689,0,208,19897,,,,, cross-sha3-r-sdpg-5-fast,ref,18593,0,208,18801,,,,, dilithium2,clean,8064,0,0,8064,,,,, -dilithium2,m4f,18596,0,0,18596,,,,, +dilithium2,m4f,19180,0,0,19180,,,,, dilithium2,m4fstack,24184,0,0,24184,,,,, dilithium3,clean,7580,0,0,7580,,,,, -dilithium3,m4f,18588,0,0,18588,,,,, +dilithium3,m4f,19188,0,0,19188,,,,, dilithium3,m4fstack,23448,0,0,23448,,,,, dilithium5,clean,7808,0,0,7808,,,,, -dilithium5,m4f,18468,0,0,18468,,,,, +dilithium5,m4f,19096,0,0,19096,,,,, dilithium5,m4fstack,23820,0,0,23820,,,,, falcon-1024,clean,82703,0,0,82703,,,,, falcon-1024,m4-ct,81825,0,79872,161697,,,,, diff --git a/benchmarks.md b/benchmarks.md index afb0ab01..87df4aa4 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -48,13 +48,13 @@ | cross-sha3-r-sdpg-3-fast (10 executions) | ref | AVG: 627,948
MIN: 625,525
MAX: 637,639 | AVG: 43,573,841
MIN: 43,565,461
MAX: 43,582,933 | AVG: 27,513,830
MIN: 27,493,024
MAX: 27,525,746 | | cross-sha3-r-sdpg-5-fast (10 executions) | ref | AVG: 1,146,280
MIN: 1,142,409
MAX: 1,153,794 | AVG: 93,557,878
MIN: 93,547,167
MAX: 93,566,329 | AVG: 59,948,216
MIN: 59,857,434
MAX: 60,043,852 | | dilithium2 (1000 executions) | clean | AVG: 1,874,167
MIN: 1,827,645
MAX: 1,914,566 | AVG: 7,493,877
MIN: 3,321,630
MAX: 40,762,756 | AVG: 2,062,795
MIN: 2,062,255
MAX: 2,063,222 | -| dilithium2 (1000 executions) | m4f | AVG: 1,426,036
MIN: 1,379,636
MAX: 1,466,394 | AVG: 3,807,970
MIN: 1,813,656
MAX: 18,528,070 | AVG: 1,417,745
MIN: 1,417,203
MAX: 1,418,192 | +| dilithium2 (1000 executions) | m4f | AVG: 1,425,723
MIN: 1,379,410
MAX: 1,466,445 | AVG: 3,835,095
MIN: 1,813,682
MAX: 16,068,642 | AVG: 1,421,307
MIN: 1,420,219
MAX: 1,422,056 | | dilithium2 (1000 executions) | m4fstack | AVG: 1,801,523
MIN: 1,684,895
MAX: 1,902,114 | AVG: 12,170,976
MIN: 3,900,911
MAX: 86,281,518 | AVG: 3,241,353
MIN: 3,194,028
MAX: 3,281,144 | | dilithium3 (1000 executions) | clean | AVG: 3,205,551
MIN: 3,204,090
MAX: 3,207,411 | AVG: 12,696,585
MIN: 5,097,364
MAX: 74,392,293 | AVG: 3,376,992
MIN: 3,376,581
MAX: 3,377,393 | -| dilithium3 (1000 executions) | m4f | AVG: 2,515,969
MIN: 2,514,498
MAX: 2,517,634 | AVG: 5,884,832
MIN: 2,917,322
MAX: 25,268,693 | AVG: 2,411,257
MIN: 2,410,858
MAX: 2,411,717 | +| dilithium3 (1000 executions) | m4f | AVG: 2,515,915
MIN: 2,514,307
MAX: 2,517,413 | AVG: 6,054,094
MIN: 2,917,316
MAX: 27,829,552 | AVG: 2,415,526
MIN: 2,414,696
MAX: 2,416,440 | | dilithium3 (1000 executions) | m4fstack | AVG: 3,412,759
MIN: 3,406,659
MAX: 3,419,247 | AVG: 23,673,016
MIN: 6,733,971
MAX: 145,803,146 | AVG: 5,733,307
MIN: 5,688,893
MAX: 5,778,120 | | dilithium5 (1000 executions) | clean | AVG: 5,341,477
MIN: 5,286,872
MAX: 5,395,822 | AVG: 15,710,371
MIN: 7,953,367
MAX: 75,940,093 | AVG: 5,609,679
MIN: 5,609,217
MAX: 5,610,183 | -| dilithium5 (1000 executions) | m4f | AVG: 4,275,029
MIN: 4,210,286
MAX: 4,329,519 | AVG: 7,977,781
MIN: 4,882,524
MAX: 25,936,176 | AVG: 4,185,417
MIN: 4,184,925
MAX: 4,185,896 | +| dilithium5 (1000 executions) | m4f | AVG: 4,275,033
MIN: 4,220,989
MAX: 4,350,945 | AVG: 8,349,360
MIN: 4,882,552
MAX: 29,688,762 | AVG: 4,192,692
MIN: 4,191,427
MAX: 4,193,764 | | dilithium5 (1000 executions) | m4fstack | AVG: 5,816,287
MIN: 5,474,236
MAX: 6,115,061 | AVG: 33,452,872
MIN: 11,170,780
MAX: 185,259,803 | AVG: 9,912,851
MIN: 9,845,789
MAX: 9,981,834 | | falcon-1024 (10 executions) | clean | AVG: 602,066,436
MIN: 377,135,260
MAX: 1,488,065,363 | AVG: 136,241,759
MIN: 136,017,549
MAX: 136,556,585 | AVG: 1,678,109
MIN: 1,677,732
MAX: 1,678,566 | | falcon-1024 (10 executions) | m4-ct | AVG: 408,725,773
MIN: 314,885,208
MAX: 712,370,124 | AVG: 87,706,019
MIN: 87,549,942
MAX: 87,839,508 | AVG: 990,541
MIN: 984,448
MAX: 997,160 | @@ -204,13 +204,13 @@ | cross-sha3-r-sdpg-3-fast | ref | 4,032 | 205,080 | 108,236 | | cross-sha3-r-sdpg-5-fast | ref | 6,824 | 398,600 | 213,436 | | dilithium2 | clean | 38,304 | 51,968 | 36,192 | -| dilithium2 | m4f | 38,296 | 49,416 | 36,220 | +| dilithium2 | m4f | 38,296 | 49,416 | 9,012 | | dilithium2 | m4fstack | 4,408 | 5,072 | 2,704 | | dilithium3 | clean | 60,832 | 79,616 | 57,728 | -| dilithium3 | m4f | 60,824 | 68,864 | 57,720 | +| dilithium3 | m4f | 60,824 | 68,864 | 9,880 | | dilithium3 | m4fstack | 4,408 | 6,608 | 2,704 | | dilithium5 | clean | 97,696 | 122,724 | 92,940 | -| dilithium5 | m4f | 97,688 | 116,076 | 92,932 | +| dilithium5 | m4f | 97,688 | 116,076 | 11,944 | | dilithium5 | m4fstack | 4,408 | 8,136 | 2,712 | | falcon-1024 | clean | 35,076 | 84,604 | 8,776 | | falcon-1024 | m4-ct | 1,156 | 2,508 | 376 | @@ -361,13 +361,13 @@ | cross-sha3-r-sdpg-3-fast | ref | 71.7% | 68.2% | 68.7% | | cross-sha3-r-sdpg-5-fast | ref | 71.1% | 66.1% | 66.8% | | dilithium2 | clean | 61.0% | 30.9% | 52.9% | -| dilithium2 | m4f | 79.9% | 60.6% | 76.8% | +| dilithium2 | m4f | 79.9% | 60.7% | 76.6% | | dilithium2 | m4fstack | 74.8% | 55.2% | 40.8% | | dilithium3 | clean | 64.7% | 31.3% | 56.8% | -| dilithium3 | m4f | 82.3% | 61.4% | 79.4% | +| dilithium3 | m4f | 82.3% | 60.7% | 79.2% | | dilithium3 | m4fstack | 77.1% | 54.6% | 41.0% | | dilithium5 | clean | 67.0% | 35.7% | 61.1% | -| dilithium5 | m4f | 83.5% | 65.0% | 81.7% | +| dilithium5 | m4f | 83.5% | 65.3% | 81.6% | | dilithium5 | m4fstack | 76.1% | 54.5% | 42.6% | | falcon-1024 | clean | 8.9% | 0.3% | 23.7% | | falcon-1024 | m4-ct | 8.6% | 0.4% | 32.2% | @@ -517,13 +517,13 @@ | cross-sha3-r-sdpg-3-fast | ref | 19,689 | 0 | 208 | 19,897 | | cross-sha3-r-sdpg-5-fast | ref | 18,593 | 0 | 208 | 18,801 | | dilithium2 | clean | 8,064 | 0 | 0 | 8,064 | -| dilithium2 | m4f | 18,596 | 0 | 0 | 18,596 | +| dilithium2 | m4f | 19,180 | 0 | 0 | 19,180 | | dilithium2 | m4fstack | 24,184 | 0 | 0 | 24,184 | | 
dilithium3 | clean | 7,580 | 0 | 0 | 7,580 | -| dilithium3 | m4f | 18,588 | 0 | 0 | 18,588 | +| dilithium3 | m4f | 19,188 | 0 | 0 | 19,188 | | dilithium3 | m4fstack | 23,448 | 0 | 0 | 23,448 | | dilithium5 | clean | 7,808 | 0 | 0 | 7,808 | -| dilithium5 | m4f | 18,468 | 0 | 0 | 18,468 | +| dilithium5 | m4f | 19,096 | 0 | 0 | 19,096 | | dilithium5 | m4fstack | 23,820 | 0 | 0 | 23,820 | | falcon-1024 | clean | 82,703 | 0 | 0 | 82,703 | | falcon-1024 | m4-ct | 81,825 | 0 | 79,872 | 161,697 | diff --git a/crypto_sign/dilithium2/m4f/packing.c b/crypto_sign/dilithium2/m4f/packing.c index 8aaff2a3..eb9d9a3e 100644 --- a/crypto_sign/dilithium2/m4f/packing.c +++ b/crypto_sign/dilithium2/m4f/packing.c @@ -2,6 +2,7 @@ #include "packing.h" #include "polyvec.h" #include "poly.h" +#include <stddef.h> /************************************************* * Name: pack_pk @@ -49,6 +50,21 @@ void unpack_pk(uint8_t rho[SEEDBYTES], polyt1_unpack(&t1->vec[i], pk + i*POLYT1_PACKEDBYTES); } +/************************************************* +* Name: unpack_pk_t1 +* +* Description: Unpack a single polynomial of t1 from public key pk = (rho, t1). +* +* Arguments: - poly *t1: pointer to output polynomial (element idx of t1) +* - size_t idx: zero-based index of the t1 element to unpack +* - const unsigned char pk[]: byte array containing bit-packed pk +**************************************************/ +void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) { + pk += SEEDBYTES; + polyt1_unpack(t1, pk + idx * POLYT1_PACKEDBYTES); +} + + /************************************************* * Name: pack_sk * @@ -283,4 +299,92 @@ int unpack_sig(uint8_t c[CTILDEBYTES], return 1; return 0; -} \ No newline at end of file +} + +/************************************************* +* Name: unpack_sig_c +* +* Description: Unpack only c from signature sig = (c, z, h). 
+* +* Arguments: - uint8_t c[]: output buffer receiving the CTILDEBYTES challenge bytes +* - const unsigned char sig[]: byte array containing +* bit-packed signature +* +* Always returns 0; the bytes are copied without any validation. NOTE(review): the trailing advance of sig is a dead store. +**************************************************/ +int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]) { + for(size_t i = 0; i < CTILDEBYTES; ++i) + c[i] = sig[i]; + sig += CTILDEBYTES; + return 0; +} + +/************************************************* +* Name: unpack_sig_z +* +* Description: Unpack only z from signature sig = (c, z, h). +* +* Arguments: - polyvecl *z: pointer to output vector z +* - const unsigned char sig[]: byte array containing +* bit-packed signature +* +* Always returns 0; this function itself rejects nothing. +**************************************************/ +int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]) { + sig += CTILDEBYTES; + for (size_t i = 0; i < L; ++i) { + polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + return 0; +} + +/************************************************* +* Name: unpack_sig_h +* +* Description: Unpack one polynomial of the hint h from signature sig = (c, z, h). The encoding of all K rows is validated on every call. +* +* Arguments: - poly *h: pointer to output hint polynomial (row idx of h) +* - size_t idx: index of the row of h to unpack +* - const unsigned char sig[]: bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]) { + sig += CTILDEBYTES; + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h: every row's encoding is validated, but only row idx is written to h */ + size_t k = 0; + for (size_t i = 0; i < K; ++i) { + for (size_t j = 0; j < N; ++j) { + if (i == idx) { + h->coeffs[j] = 0; + } + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (size_t j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + if (i == idx) { + h->coeffs[sig[j]] = 1; + } + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (size_t j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + return 0; +} + diff --git a/crypto_sign/dilithium2/m4f/packing.h b/crypto_sign/dilithium2/m4f/packing.h index 35553545..78ef2c2c 100644 --- a/crypto_sign/dilithium2/m4f/packing.h +++ b/crypto_sign/dilithium2/m4f/packing.h @@ -2,6 +2,7 @@ #define PACKING_H #include <stdint.h> +#include <stddef.h> #include "params.h" #include "polyvec.h" #include "smallpoly.h" @@ -24,6 +25,9 @@ void pack_sig(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES], const pol #define unpack_pk DILITHIUM_NAMESPACE(unpack_pk) void unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[CRYPTO_PUBLICKEYBYTES]); +#define unpack_pk_t1 DILITHIUM_NAMESPACE(unpack_pk_t1) +void unpack_pk_t1(poly *t1, size_t idx, const unsigned char pk[CRYPTO_PUBLICKEYBYTES]); + #define unpack_sk DILITHIUM_NAMESPACE(unpack_sk) void unpack_sk(uint8_t rho[SEEDBYTES], uint8_t tr[TRBYTES], @@ -36,6 +40,15 @@ void unpack_sk(uint8_t rho[SEEDBYTES], #define unpack_sig DILITHIUM_NAMESPACE(unpack_sig) int unpack_sig(uint8_t c[CTILDEBYTES], polyvecl *z, polyveck *h, const uint8_t sig[CRYPTO_BYTES]); + +#define unpack_sig_z DILITHIUM_NAMESPACE(unpack_sig_z) +int unpack_sig_z(polyvecl *z, const unsigned char sig[CRYPTO_BYTES]); +#define unpack_sig_h 
DILITHIUM_NAMESPACE(unpack_sig_h) +int unpack_sig_h(poly *h, size_t idx, const unsigned char sig[CRYPTO_BYTES]); +#define unpack_sig_c DILITHIUM_NAMESPACE(unpack_sig_c) +int unpack_sig_c(uint8_t c[CTILDEBYTES], const unsigned char sig[CRYPTO_BYTES]); + + #define pack_sig_c DILITHIUM_NAMESPACE(pack_sig_c) void pack_sig_c(uint8_t sig[CRYPTO_BYTES], const uint8_t c[CTILDEBYTES]); diff --git a/crypto_sign/dilithium2/m4f/poly.c b/crypto_sign/dilithium2/m4f/poly.c index 0d40fda3..654f4f23 100644 --- a/crypto_sign/dilithium2/m4f/poly.c +++ b/crypto_sign/dilithium2/m4f/poly.c @@ -45,6 +45,18 @@ void poly_caddq(poly *a) { asm_caddq(a->coeffs); } +/************************************************* +* Name: poly_csubq +* +* Description: For all coefficients of input polynomial subtract Q if +* coefficient is at least Q; then add Q if coefficient is negative (delegates to asm_csubq). +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void poly_csubq(poly *a) { + asm_csubq(a->coeffs); +} + #if 0 /************************************************* * Name: poly_freeze diff --git a/crypto_sign/dilithium2/m4f/poly.h b/crypto_sign/dilithium2/m4f/poly.h index 8f8819b0..af9e7a50 100644 --- a/crypto_sign/dilithium2/m4f/poly.h +++ b/crypto_sign/dilithium2/m4f/poly.h @@ -12,6 +12,8 @@ typedef struct { void poly_reduce(poly *a); #define poly_caddq DILITHIUM_NAMESPACE(poly_caddq) void poly_caddq(poly *a); +#define poly_csubq DILITHIUM_NAMESPACE(poly_csubq) +void poly_csubq(poly *a); #define poly_freeze DILITHIUM_NAMESPACE(poly_freeze) void poly_freeze(poly *a); diff --git a/crypto_sign/dilithium2/m4f/sign.c b/crypto_sign/dilithium2/m4f/sign.c index 04bec45c..d1c5222b 100644 --- a/crypto_sign/dilithium2/m4f/sign.c +++ b/crypto_sign/dilithium2/m4f/sign.c @@ -225,20 +225,36 @@ int crypto_sign(uint8_t *sm, *smlen += mlen; return 0; } +/************************************************* + * Name: expand_mat_elem + * + * Description: Implementation of 
ExpandA for a single entry. Generates the element a_{k_idx,l_idx} of A via + * poly_uniform with the 16-bit nonce (k_idx << 8) + l_idx; poly_uniform is expected to do the rejection + * sampling on the output stream of SHAKE128(rho|nonce) -- TODO confirm against poly_uniform. + * + * Arguments: - poly *mat_elem: output matrix element + * - const unsigned char rho[]: byte array containing seed rho + * - size_t k_idx: matrix row index (shifted into the upper bits of the nonce) + * - size_t l_idx: matrix col index (added to k_idx << 8) + **************************************************/ +static void expand_mat_elem(poly *mat_elem, const unsigned char rho[SEEDBYTES], size_t k_idx, size_t l_idx) +{ + poly_uniform(mat_elem, rho, (uint16_t)((k_idx << 8) + l_idx)); +} /************************************************* -* Name: crypto_sign_verify -* -* Description: Verifies signature. -* -* Arguments: - uint8_t *m: pointer to input signature -* - size_t siglen: length of signature -* - const uint8_t *m: pointer to message -* - size_t mlen: length of message -* - const uint8_t *pk: pointer to bit-packed public key -* -* Returns 0 if signature could be verified correctly and -1 otherwise -**************************************************/ + * Name: crypto_sign_verify + * + * Description: Verifies signature. 
+ * + * Arguments: - const uint8_t *sig: pointer to input signature + * - size_t siglen: length of signature + * - const uint8_t *m: pointer to message + * - size_t mlen: length of message + * - const uint8_t *pk: pointer to bit-packed public key + * + * Returns 0 if signature could be verified correctly and -1 otherwise + **************************************************/ int crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, @@ -246,23 +262,23 @@ int crypto_sign_verify(const uint8_t *sig, const uint8_t *pk) { unsigned int i; - uint8_t buf[K*POLYW1_PACKEDBYTES]; - uint8_t rho[SEEDBYTES]; + const uint8_t *rho = pk; uint8_t mu[CRHBYTES]; uint8_t c[CTILDEBYTES]; uint8_t c2[CTILDEBYTES]; poly cp; - polyvecl mat[K], z; - polyveck t1, w1, h; + polyvecl z; shake256incctx state; - if(siglen != CRYPTO_BYTES) + poly tmp_elem, w1_elem; + + if (siglen != CRYPTO_BYTES) return -1; - unpack_pk(rho, &t1, pk); - if(unpack_sig(c, &z, &h, sig)) + if (unpack_sig_z(&z, sig) != 0) { return -1; - if(polyvecl_chknorm(&z, GAMMA1 - BETA)) + } + if (polyvecl_chknorm(&z, GAMMA1 - BETA)) return -1; /* Compute CRH(h(rho, t1), msg) */ @@ -273,35 +289,58 @@ int crypto_sign_verify(const uint8_t *sig, shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); + // Start hashing [mu || w1'] to recompute c; each packed row of w1' is absorbed inside the loop below. + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + if (unpack_sig_c(c, sig) != 0) { + return -1; + } poly_challenge(&cp, c); - polyvec_matrix_expand(mat, rho); - + poly_ntt(&cp); polyvecl_ntt(&z); - polyvec_matrix_pointwise_montgomery(&w1, mat, &z); - poly_ntt(&cp); - polyveck_shiftl(&t1); - polyveck_ntt(&t1); - polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); - polyveck_sub(&w1, &w1, &t1); - polyveck_reduce(&w1); - polyveck_invntt_tomont(&w1); + for (size_t k_idx = 0; k_idx < K; k_idx++) + { + // Sample A[k_idx][0] and start this row's accumulator with A[k_idx][0] * z[0]. 
+ expand_mat_elem(&tmp_elem, rho, k_idx, 0); + poly_pointwise_montgomery(&w1_elem, &tmp_elem, &z.vec[0]); + + for (size_t l_idx = 1; l_idx < L; l_idx++) + { + // Sample A[k_idx][l_idx] and accumulate A[k_idx][l_idx] * z[l_idx] into the row. + expand_mat_elem(&tmp_elem, rho, k_idx, l_idx); + poly_pointwise_acc_montgomery(&w1_elem, &tmp_elem, &z.vec[l_idx]); + } + + // Subtract c*(t1_{k_idx} * 2^d); tmp_elem is reused as scratch for the unpacked t1 row. + unpack_pk_t1(&tmp_elem, k_idx, pk); + poly_shiftl(&tmp_elem); + poly_ntt(&tmp_elem); + poly_pointwise_montgomery(&tmp_elem, &cp, &tmp_elem); + poly_sub(&w1_elem, &w1_elem, &tmp_elem); + poly_reduce(&w1_elem); + poly_invntt_tomont(&w1_elem); + + // Reconstruct row k_idx of w1': apply this row's hint (unpack_sig_h re-validates the whole hint encoding on each call), then pack and absorb it. + poly_csubq(&w1_elem); + if (unpack_sig_h(&tmp_elem, k_idx, sig) != 0) { + return -1; + } + poly_use_hint(&w1_elem, &w1_elem, &tmp_elem); + uint8_t w1_packed[POLYW1_PACKEDBYTES]; + polyw1_pack(w1_packed, &w1_elem); + shake256_inc_absorb(&state, w1_packed, POLYW1_PACKEDBYTES); + } - /* Reconstruct w1 */ - polyveck_caddq(&w1); - polyveck_use_hint(&w1, &w1, &h); - polyveck_pack_w1(buf, &w1); /* Call random oracle and verify challenge */ - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); - shake256_inc_absorb(&state, buf, K*POLYW1_PACKEDBYTES); shake256_inc_finalize(&state); shake256_inc_squeeze(c2, CTILDEBYTES, &state); - for(i = 0; i < CTILDEBYTES; ++i) - if(c[i] != c2[i]) + for (i = 0; i < CTILDEBYTES; ++i) + if (c[i] != c2[i]) return -1; return 0; diff --git a/crypto_sign/dilithium2/m4f/vector.h b/crypto_sign/dilithium2/m4f/vector.h index e5c5dda3..183ddc83 100644 --- a/crypto_sign/dilithium2/m4f/vector.h +++ b/crypto_sign/dilithium2/m4f/vector.h @@ -10,6 +10,8 @@ void asm_reduce32(int32_t a[N]); void small_asm_reduce32_central(int32_t a[N]); #define asm_caddq DILITHIUM_NAMESPACE(asm_caddq) void asm_caddq(int32_t a[N]); +#define asm_csubq DILITHIUM_NAMESPACE(asm_csubq) +void asm_csubq(int32_t a[N]); #define asm_freeze DILITHIUM_NAMESPACE(asm_freeze) void asm_freeze(int32_t a[N]); #define asm_rej_uniform DILITHIUM_NAMESPACE(asm_rej_uniform) 
diff --git a/crypto_sign/dilithium2/m4f/vector.s b/crypto_sign/dilithium2/m4f/vector.s index 559f11b0..a393c914 100644 --- a/crypto_sign/dilithium2/m4f/vector.s +++ b/crypto_sign/dilithium2/m4f/vector.s @@ -169,6 +169,59 @@ pqcrystals_dilithium_asm_caddq: bx lr .size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq +.macro csubq a, tmp, q + cmp.n \a, \q + it ge + subge.w \a, \a, \q + cmp \a, #0 + it mi + addmi.w \a, \a, \q +.endm + +// void asm_csubq(int32_t a[N]); in place, per coefficient: subtract q when a >= q, then add q when the result is negative. The macro's tmp argument (r9 here) is never referenced by the macro body. +.global pqcrystals_dilithium_asm_csubq +.type pqcrystals_dilithium_asm_csubq, %function +.align 2 +pqcrystals_dilithium_asm_csubq: + push {r4-r10} + + movw r12,#:lower16:8380417 + movt r12,#:upper16:8380417 + + movw r10, #32 // loop counter: 32 iterations of 8 coefficients each + 1: + ldr.w r1, [r0] + ldr.w r2, [r0, #1*4] + ldr.w r3, [r0, #2*4] + ldr.w r4, [r0, #3*4] + ldr.w r5, [r0, #4*4] + ldr.w r6, [r0, #5*4] + ldr.w r7, [r0, #6*4] + ldr.w r8, [r0, #7*4] + + csubq r1, r9, r12 + csubq r2, r9, r12 + csubq r3, r9, r12 + csubq r4, r9, r12 + csubq r5, r9, r12 + csubq r6, r9, r12 + csubq r7, r9, r12 + csubq r8, r9, r12 + + str.w r2, [r0, #1*4] + str.w r3, [r0, #2*4] + str.w r4, [r0, #3*4] + str.w r5, [r0, #4*4] + str.w r6, [r0, #5*4] + str.w r7, [r0, #6*4] + str.w r8, [r0, #7*4] + str r1, [r0], #8*4 // post-indexed store: also advances r0 to the next 8 coefficients + subs r10, #1 + bne.w 1b + + pop {r4-r10} + bx lr +.size pqcrystals_dilithium_asm_csubq, .-pqcrystals_dilithium_asm_csubq // asm_rej_uniform(int32_t *a,unsigned int len,const unsigned char *buf, unsigned int buflen); .global pqcrystals_dilithium_asm_rej_uniform