diff --git a/benchmarks.csv b/benchmarks.csv index ddfa372d..4bc2981a 100644 --- a/benchmarks.csv +++ b/benchmarks.csv @@ -8,15 +8,15 @@ bikel3 (10 executions),opt,248083316,248083286,248083345,16405238,16405236,16405 hqc-128 (10 executions),clean,52705201,52705180,52705224,105650897,105650877,105650927,159569179,159569176,159569183 hqc-192 (10 executions),clean,161458617,161458590,161458638,323146261,323146250,323146292,486156251,486156214,486156266 hqc-256 (10 executions),clean,295934078,295934057,295934104,591853870,591853850,591853898,891163005,891162988,891163038 -ml-kem-1024 (10 executions),clean,1540607,1535678,1547404,1712361,1707431,1719164,2024616,2019674,2031410 -ml-kem-1024 (10 executions),m4fspeed,1007759,1004549,1017159,1030702,1027443,1040101,1093153,1089895,1102552 -ml-kem-1024 (10 executions),m4fstack,1010100,1007459,1018999,1038190,1035550,1047090,1101234,1098594,1110134 -ml-kem-512 (10 executions),clean,595882,595636,596329,700689,700440,701134,888740,888494,889189 -ml-kem-512 (10 executions),m4fspeed,388544,387190,398814,392178,390825,402445,429456,428102,439722 -ml-kem-512 (10 executions),m4fstack,386999,386677,387465,392813,392492,393280,430152,429831,430619 -ml-kem-768 (10 executions),clean,990901,986123,997663,1140412,1135630,1147169,1390176,1385399,1396936 -ml-kem-768 (10 executions),m4fspeed,631949,631658,632164,656369,656082,656596,705436,705148,705663 -ml-kem-768 (10 executions),m4fstack,634227,632761,644635,662468,661003,672877,711996,710530,722405 +ml-kem-1024 (10 executions),clean,1536343,1535750,1536698,1708071,1707476,1708427,2020327,2019721,2020672 +ml-kem-1024 (10 executions),m4fspeed,1018976,1014877,1026934,1031565,1027454,1039544,1094008,1089897,1101987 +ml-kem-1024 (10 executions),m4fstack,1020202,1017478,1029553,1037953,1035260,1047298,1100982,1098251,1110327 +ml-kem-512 (10 executions),clean,595793,595576,595971,700605,700383,700779,888653,888436,888831 +ml-kem-512 (10 executions),m4fspeed,392423,392211,392614,390881,390671,391073,428167,427955,428357 +ml-kem-512 (10 executions),m4fstack,392224,391772,392541,392864,392407,393181,430202,429745,430519 +ml-kem-768 (10 executions),clean,988722,985880,998135,1138225,1135419,1147634,1387984,1385144,1397397 +ml-kem-768 (10 executions),m4fspeed,642096,639116,651103,658754,655785,667769,707827,704858,716842 +ml-kem-768 (10 executions),m4fstack,644195,640433,652374,664654,660893,672834,714194,710433,722374 Signature Schemes,,,,,,,,,, Scheme,Implementation,Key Generation [cycles] (mean),Key Generation [cycles] (min),Key Generation [cycles] (max),Sign [cycles] (mean),Sign [cycles] (min),Sign [cycles] (max),Verify [cycles] (mean),Verify [cycles] (min),Verify [cycles] (max) aimer128f (10 executions),m4speed,490087,490087,490088,28590420,28590395,28590439,26750578,26749771,26751014 @@ -315,15 +315,15 @@ bikel3,opt,0.0,6.3,0.2,,,,,, hqc-128,clean,0.4,0.8,0.5,,,,,, hqc-192,clean,0.3,0.5,0.3,,,,,, hqc-256,clean,0.2,0.4,0.3,,,,,, -ml-kem-1024,clean,50.0,45.6,38.6,,,,,, -ml-kem-1024,m4fspeed,76.1,75.5,71.2,,,,,, -ml-kem-1024,m4fstack,75.9,74.9,70.7,,,,,, -ml-kem-512,clean,49.9,41.1,32.5,,,,,, -ml-kem-512,m4fspeed,76.5,73.5,67.2,,,,,, -ml-kem-512,m4fstack,76.6,73.2,66.9,,,,,, -ml-kem-768,clean,48.6,43.2,35.4,,,,,, -ml-kem-768,m4fspeed,75.5,74.4,69.3,,,,,, -ml-kem-768,m4fstack,75.3,73.9,68.7,,,,,, +ml-kem-1024,clean,50.0,45.7,38.6,,,,,, +ml-kem-1024,m4fspeed,75.3,75.5,71.2,,,,,, +ml-kem-1024,m4fstack,75.1,74.9,70.7,,,,,, +ml-kem-512,clean,49.8,41.1,32.5,,,,,, +ml-kem-512,m4fspeed,75.5,73.5,67.1,,,,,, +ml-kem-512,m4fstack,75.6,73.2,66.9,,,,,, +ml-kem-768,clean,48.5,43.2,35.4,,,,,, +ml-kem-768,m4fspeed,74.5,74.4,69.2,,,,,, +ml-kem-768,m4fstack,74.4,73.9,68.7,,,,,, Signature Schemes,,,,,,,,,, Scheme,Implementation,Key Generation [%],Sign [%],Verify [%],,,,,, aimer128f,m4speed,57.9,49.5,50.1,,,,,, @@ -469,14 +469,14 @@ hqc-128,clean,18628,0,0,18628,,,,, hqc-192,clean,21104,0,0,21104,,,,, hqc-256,clean,26260,0,0,26260,,,,, ml-kem-1024,clean,6160,0,0,6160,,,,, -ml-kem-1024,m4fspeed,16912,0,0,16912,,,,, -ml-kem-1024,m4fstack,14012,0,0,14012,,,,, +ml-kem-1024,m4fspeed,16916,0,0,16916,,,,, +ml-kem-1024,m4fstack,14016,0,0,14016,,,,, ml-kem-512,clean,5116,0,0,5116,,,,, -ml-kem-512,m4fspeed,15844,0,0,15844,,,,, -ml-kem-512,m4fstack,13324,0,0,13324,,,,, +ml-kem-512,m4fspeed,15848,0,0,15848,,,,, +ml-kem-512,m4fstack,13328,0,0,13328,,,,, ml-kem-768,clean,5120,0,0,5120,,,,, -ml-kem-768,m4fspeed,16012,0,0,16012,,,,, -ml-kem-768,m4fstack,13316,0,0,13316,,,,, +ml-kem-768,m4fspeed,16016,0,0,16016,,,,, +ml-kem-768,m4fstack,13320,0,0,13320,,,,, Signature Schemes,,,,,,,,,, Scheme,Implementation,.text [bytes],.data [bytes],.bss [bytes],Total [bytes],,,,, aimer128f,m4speed,15992,0,0,15992,,,,, diff --git a/benchmarks.md b/benchmarks.md index e131b348..afe0ae3a 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -9,15 +9,15 @@ | hqc-128 (10 executions) | clean | AVG: 52,705,201
MIN: 52,705,180
MAX: 52,705,224 | AVG: 105,650,897
MIN: 105,650,877
MAX: 105,650,927 | AVG: 159,569,179
MIN: 159,569,176
MAX: 159,569,183 | | hqc-192 (10 executions) | clean | AVG: 161,458,617
MIN: 161,458,590
MAX: 161,458,638 | AVG: 323,146,261
MIN: 323,146,250
MAX: 323,146,292 | AVG: 486,156,251
MIN: 486,156,214
MAX: 486,156,266 | | hqc-256 (10 executions) | clean | AVG: 295,934,078
MIN: 295,934,057
MAX: 295,934,104 | AVG: 591,853,870
MIN: 591,853,850
MAX: 591,853,898 | AVG: 891,163,005
MIN: 891,162,988
MAX: 891,163,038 | -| ml-kem-1024 (10 executions) | clean | AVG: 1,540,607
MIN: 1,535,678
MAX: 1,547,404 | AVG: 1,712,361
MIN: 1,707,431
MAX: 1,719,164 | AVG: 2,024,616
MIN: 2,019,674
MAX: 2,031,410 | -| ml-kem-1024 (10 executions) | m4fspeed | AVG: 1,007,759
MIN: 1,004,549
MAX: 1,017,159 | AVG: 1,030,702
MIN: 1,027,443
MAX: 1,040,101 | AVG: 1,093,153
MIN: 1,089,895
MAX: 1,102,552 | -| ml-kem-1024 (10 executions) | m4fstack | AVG: 1,010,100
MIN: 1,007,459
MAX: 1,018,999 | AVG: 1,038,190
MIN: 1,035,550
MAX: 1,047,090 | AVG: 1,101,234
MIN: 1,098,594
MAX: 1,110,134 | -| ml-kem-512 (10 executions) | clean | AVG: 595,882
MIN: 595,636
MAX: 596,329 | AVG: 700,689
MIN: 700,440
MAX: 701,134 | AVG: 888,740
MIN: 888,494
MAX: 889,189 | -| ml-kem-512 (10 executions) | m4fspeed | AVG: 388,544
MIN: 387,190
MAX: 398,814 | AVG: 392,178
MIN: 390,825
MAX: 402,445 | AVG: 429,456
MIN: 428,102
MAX: 439,722 | -| ml-kem-512 (10 executions) | m4fstack | AVG: 386,999
MIN: 386,677
MAX: 387,465 | AVG: 392,813
MIN: 392,492
MAX: 393,280 | AVG: 430,152
MIN: 429,831
MAX: 430,619 | -| ml-kem-768 (10 executions) | clean | AVG: 990,901
MIN: 986,123
MAX: 997,663 | AVG: 1,140,412
MIN: 1,135,630
MAX: 1,147,169 | AVG: 1,390,176
MIN: 1,385,399
MAX: 1,396,936 | -| ml-kem-768 (10 executions) | m4fspeed | AVG: 631,949
MIN: 631,658
MAX: 632,164 | AVG: 656,369
MIN: 656,082
MAX: 656,596 | AVG: 705,436
MIN: 705,148
MAX: 705,663 | -| ml-kem-768 (10 executions) | m4fstack | AVG: 634,227
MIN: 632,761
MAX: 644,635 | AVG: 662,468
MIN: 661,003
MAX: 672,877 | AVG: 711,996
MIN: 710,530
MAX: 722,405 | +| ml-kem-1024 (10 executions) | clean | AVG: 1,536,343
MIN: 1,535,750
MAX: 1,536,698 | AVG: 1,708,071
MIN: 1,707,476
MAX: 1,708,427 | AVG: 2,020,327
MIN: 2,019,721
MAX: 2,020,672 | +| ml-kem-1024 (10 executions) | m4fspeed | AVG: 1,018,976
MIN: 1,014,877
MAX: 1,026,934 | AVG: 1,031,565
MIN: 1,027,454
MAX: 1,039,544 | AVG: 1,094,008
MIN: 1,089,897
MAX: 1,101,987 | +| ml-kem-1024 (10 executions) | m4fstack | AVG: 1,020,202
MIN: 1,017,478
MAX: 1,029,553 | AVG: 1,037,953
MIN: 1,035,260
MAX: 1,047,298 | AVG: 1,100,982
MIN: 1,098,251
MAX: 1,110,327 | +| ml-kem-512 (10 executions) | clean | AVG: 595,793
MIN: 595,576
MAX: 595,971 | AVG: 700,605
MIN: 700,383
MAX: 700,779 | AVG: 888,653
MIN: 888,436
MAX: 888,831 | +| ml-kem-512 (10 executions) | m4fspeed | AVG: 392,423
MIN: 392,211
MAX: 392,614 | AVG: 390,881
MIN: 390,671
MAX: 391,073 | AVG: 428,167
MIN: 427,955
MAX: 428,357 | +| ml-kem-512 (10 executions) | m4fstack | AVG: 392,224
MIN: 391,772
MAX: 392,541 | AVG: 392,864
MIN: 392,407
MAX: 393,181 | AVG: 430,202
MIN: 429,745
MAX: 430,519 | +| ml-kem-768 (10 executions) | clean | AVG: 988,722
MIN: 985,880
MAX: 998,135 | AVG: 1,138,225
MIN: 1,135,419
MAX: 1,147,634 | AVG: 1,387,984
MIN: 1,385,144
MAX: 1,397,397 | +| ml-kem-768 (10 executions) | m4fspeed | AVG: 642,096
MIN: 639,116
MAX: 651,103 | AVG: 658,754
MIN: 655,785
MAX: 667,769 | AVG: 707,827
MIN: 704,858
MAX: 716,842 | +| ml-kem-768 (10 executions) | m4fstack | AVG: 644,195
MIN: 640,433
MAX: 652,374 | AVG: 664,654
MIN: 660,893
MAX: 672,834 | AVG: 714,194
MIN: 710,433
MAX: 722,374 | ## Signature Schemes | scheme | implementation | key generation [cycles] | sign [cycles] | verify [cycles] | | ------ | -------------- | ----------------------- | ------------- | --------------- | @@ -319,15 +319,15 @@ | hqc-128 | clean | 0.4% | 0.8% | 0.5% | | hqc-192 | clean | 0.3% | 0.5% | 0.3% | | hqc-256 | clean | 0.2% | 0.4% | 0.3% | -| ml-kem-1024 | clean | 50.0% | 45.6% | 38.6% | -| ml-kem-1024 | m4fspeed | 76.1% | 75.5% | 71.2% | -| ml-kem-1024 | m4fstack | 75.9% | 74.9% | 70.7% | -| ml-kem-512 | clean | 49.9% | 41.1% | 32.5% | -| ml-kem-512 | m4fspeed | 76.5% | 73.5% | 67.2% | -| ml-kem-512 | m4fstack | 76.6% | 73.2% | 66.9% | -| ml-kem-768 | clean | 48.6% | 43.2% | 35.4% | -| ml-kem-768 | m4fspeed | 75.5% | 74.4% | 69.3% | -| ml-kem-768 | m4fstack | 75.3% | 73.9% | 68.7% | +| ml-kem-1024 | clean | 50.0% | 45.7% | 38.6% | +| ml-kem-1024 | m4fspeed | 75.3% | 75.5% | 71.2% | +| ml-kem-1024 | m4fstack | 75.1% | 74.9% | 70.7% | +| ml-kem-512 | clean | 49.8% | 41.1% | 32.5% | +| ml-kem-512 | m4fspeed | 75.5% | 73.5% | 67.1% | +| ml-kem-512 | m4fstack | 75.6% | 73.2% | 66.9% | +| ml-kem-768 | clean | 48.5% | 43.2% | 35.4% | +| ml-kem-768 | m4fspeed | 74.5% | 74.4% | 69.2% | +| ml-kem-768 | m4fstack | 74.4% | 73.9% | 68.7% | ## Signature Schemes | Scheme | Implementation | Key Generation [%] | Sign [%] | Verify [%] | | ------ | -------------- | ------------------ | -------- | ---------- | @@ -475,14 +475,14 @@ | hqc-192 | clean | 21,104 | 0 | 0 | 21,104 | | hqc-256 | clean | 26,260 | 0 | 0 | 26,260 | | ml-kem-1024 | clean | 6,160 | 0 | 0 | 6,160 | -| ml-kem-1024 | m4fspeed | 16,912 | 0 | 0 | 16,912 | -| ml-kem-1024 | m4fstack | 14,012 | 0 | 0 | 14,012 | +| ml-kem-1024 | m4fspeed | 16,916 | 0 | 0 | 16,916 | +| ml-kem-1024 | m4fstack | 14,016 | 0 | 0 | 14,016 | | ml-kem-512 | clean | 5,116 | 0 | 0 | 5,116 | -| ml-kem-512 | m4fspeed | 15,844 | 0 | 0 | 15,844 | -| ml-kem-512 | m4fstack | 13,324 | 0 | 0 | 13,324 | +| ml-kem-512 | m4fspeed | 15,848 | 0 | 0 | 15,848 | +| ml-kem-512 | m4fstack | 13,328 | 0 | 0 | 13,328 | | ml-kem-768 | clean | 5,120 | 0 | 0 | 5,120 | -| ml-kem-768 | m4fspeed | 16,012 | 0 | 0 | 16,012 | -| ml-kem-768 | m4fstack | 13,316 | 0 | 0 | 13,316 | +| ml-kem-768 | m4fspeed | 16,016 | 0 | 0 | 16,016 | +| ml-kem-768 | m4fstack | 13,320 | 0 | 0 | 13,320 | ## Signature Schemes | Scheme | Implementation | .text [bytes] | .data [bytes] | .bss [bytes] | Total [bytes] | | ------ | -------------- | ------------- | ------------- | ------------ | ------------- | diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.c b/crypto_kem/ml-kem-512/m4fspeed/poly.c index 401b26b7..5a6d0abb 100644 --- a/crypto_kem/ml-kem-512/m4fspeed/poly.c +++ b/crypto_kem/ml-kem-512/m4fspeed/poly.c @@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) { int i; uint16_t t0, t1; + poly_reduce(a); poly_reduce(a); for (i = 0; i < KYBER_N / 2; i++) { diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.c b/crypto_kem/ml-kem-512/m4fstack/poly.c index 443fdbae..ed54ec87 100644 --- a/crypto_kem/ml-kem-512/m4fstack/poly.c +++ b/crypto_kem/ml-kem-512/m4fstack/poly.c @@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) { int i; uint16_t t0, t1; + poly_reduce(a); poly_reduce(a); for (i = 0; i < KYBER_N / 2; i++) { diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.c b/crypto_kem/ml-kem-768/m4fspeed/poly.c index b52060f9..b909a85c 100644 --- a/crypto_kem/ml-kem-768/m4fspeed/poly.c +++ b/crypto_kem/ml-kem-768/m4fspeed/poly.c @@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) { d0 >>= 31; t[k] = d0 & 0x7ff; } - + r[352*i+11*j+ 0] = t[0] & 0xff; r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); @@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) { int i; uint16_t t0, t1; + poly_reduce(a); poly_reduce(a); for (i = 0; i < KYBER_N / 2; i++) { @@ -465,7 +466,7 @@ void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add * Using strategy of better accumulation (initial step). * Arguments: - const poly *a: pointer to input polynomial * - const poly *b: pointer to input polynomial -* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - const poly *a_prime: pointer to a pre-multiplied by zetas * - int32_t *r_tmp: array for accumulating unreduced results **************************************************/ extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); @@ -481,7 +482,7 @@ void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const * Using strategy of better accumulation. * Arguments: - const poly *a: pointer to input polynomial * - const poly *b: pointer to input polynomial -* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - const poly *a_prime: pointer to a pre-multiplied by zetas * - int32_t *r_tmp: array for accumulating unreduced results **************************************************/ extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *); @@ -497,7 +498,7 @@ void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, co * Using strategy of better accumulation (final step). * Arguments: - const poly *a: pointer to input polynomial * - const poly *b: pointer to input polynomial -* - const poly *a_prime: pointer to a pre-multiplied by zetas +* - const poly *a_prime: pointer to a pre-multiplied by zetas * - poly *r: pointer to output polynomial * - int32_t *r_tmp: array for accumulating unreduced results **************************************************/ diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.c b/crypto_kem/ml-kem-768/m4fstack/poly.c index 35475adb..fb13d155 100644 --- a/crypto_kem/ml-kem-768/m4fstack/poly.c +++ b/crypto_kem/ml-kem-768/m4fstack/poly.c @@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) { d0 >>= 31; t[k] = d0 & 0x7ff; } - + r[352*i+11*j+ 0] = t[0] & 0xff; r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3); @@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) { int i; uint16_t t0, t1; + poly_reduce(a); poly_reduce(a); for (i = 0; i < KYBER_N / 2; i++) {