diff --git a/benchmarks.csv b/benchmarks.csv
index ddfa372d..4bc2981a 100644
--- a/benchmarks.csv
+++ b/benchmarks.csv
@@ -8,15 +8,15 @@ bikel3 (10 executions),opt,248083316,248083286,248083345,16405238,16405236,16405
hqc-128 (10 executions),clean,52705201,52705180,52705224,105650897,105650877,105650927,159569179,159569176,159569183
hqc-192 (10 executions),clean,161458617,161458590,161458638,323146261,323146250,323146292,486156251,486156214,486156266
hqc-256 (10 executions),clean,295934078,295934057,295934104,591853870,591853850,591853898,891163005,891162988,891163038
-ml-kem-1024 (10 executions),clean,1540607,1535678,1547404,1712361,1707431,1719164,2024616,2019674,2031410
-ml-kem-1024 (10 executions),m4fspeed,1007759,1004549,1017159,1030702,1027443,1040101,1093153,1089895,1102552
-ml-kem-1024 (10 executions),m4fstack,1010100,1007459,1018999,1038190,1035550,1047090,1101234,1098594,1110134
-ml-kem-512 (10 executions),clean,595882,595636,596329,700689,700440,701134,888740,888494,889189
-ml-kem-512 (10 executions),m4fspeed,388544,387190,398814,392178,390825,402445,429456,428102,439722
-ml-kem-512 (10 executions),m4fstack,386999,386677,387465,392813,392492,393280,430152,429831,430619
-ml-kem-768 (10 executions),clean,990901,986123,997663,1140412,1135630,1147169,1390176,1385399,1396936
-ml-kem-768 (10 executions),m4fspeed,631949,631658,632164,656369,656082,656596,705436,705148,705663
-ml-kem-768 (10 executions),m4fstack,634227,632761,644635,662468,661003,672877,711996,710530,722405
+ml-kem-1024 (10 executions),clean,1536343,1535750,1536698,1708071,1707476,1708427,2020327,2019721,2020672
+ml-kem-1024 (10 executions),m4fspeed,1018976,1014877,1026934,1031565,1027454,1039544,1094008,1089897,1101987
+ml-kem-1024 (10 executions),m4fstack,1020202,1017478,1029553,1037953,1035260,1047298,1100982,1098251,1110327
+ml-kem-512 (10 executions),clean,595793,595576,595971,700605,700383,700779,888653,888436,888831
+ml-kem-512 (10 executions),m4fspeed,392423,392211,392614,390881,390671,391073,428167,427955,428357
+ml-kem-512 (10 executions),m4fstack,392224,391772,392541,392864,392407,393181,430202,429745,430519
+ml-kem-768 (10 executions),clean,988722,985880,998135,1138225,1135419,1147634,1387984,1385144,1397397
+ml-kem-768 (10 executions),m4fspeed,642096,639116,651103,658754,655785,667769,707827,704858,716842
+ml-kem-768 (10 executions),m4fstack,644195,640433,652374,664654,660893,672834,714194,710433,722374
Signature Schemes,,,,,,,,,,
Scheme,Implementation,Key Generation [cycles] (mean),Key Generation [cycles] (min),Key Generation [cycles] (max),Sign [cycles] (mean),Sign [cycles] (min),Sign [cycles] (max),Verify [cycles] (mean),Verify [cycles] (min),Verify [cycles] (max)
aimer128f (10 executions),m4speed,490087,490087,490088,28590420,28590395,28590439,26750578,26749771,26751014
@@ -315,15 +315,15 @@ bikel3,opt,0.0,6.3,0.2,,,,,,
hqc-128,clean,0.4,0.8,0.5,,,,,,
hqc-192,clean,0.3,0.5,0.3,,,,,,
hqc-256,clean,0.2,0.4,0.3,,,,,,
-ml-kem-1024,clean,50.0,45.6,38.6,,,,,,
-ml-kem-1024,m4fspeed,76.1,75.5,71.2,,,,,,
-ml-kem-1024,m4fstack,75.9,74.9,70.7,,,,,,
-ml-kem-512,clean,49.9,41.1,32.5,,,,,,
-ml-kem-512,m4fspeed,76.5,73.5,67.2,,,,,,
-ml-kem-512,m4fstack,76.6,73.2,66.9,,,,,,
-ml-kem-768,clean,48.6,43.2,35.4,,,,,,
-ml-kem-768,m4fspeed,75.5,74.4,69.3,,,,,,
-ml-kem-768,m4fstack,75.3,73.9,68.7,,,,,,
+ml-kem-1024,clean,50.0,45.7,38.6,,,,,,
+ml-kem-1024,m4fspeed,75.3,75.5,71.2,,,,,,
+ml-kem-1024,m4fstack,75.1,74.9,70.7,,,,,,
+ml-kem-512,clean,49.8,41.1,32.5,,,,,,
+ml-kem-512,m4fspeed,75.5,73.5,67.1,,,,,,
+ml-kem-512,m4fstack,75.6,73.2,66.9,,,,,,
+ml-kem-768,clean,48.5,43.2,35.4,,,,,,
+ml-kem-768,m4fspeed,74.5,74.4,69.2,,,,,,
+ml-kem-768,m4fstack,74.4,73.9,68.7,,,,,,
Signature Schemes,,,,,,,,,,
Scheme,Implementation,Key Generation [%],Sign [%],Verify [%],,,,,,
aimer128f,m4speed,57.9,49.5,50.1,,,,,,
@@ -469,14 +469,14 @@ hqc-128,clean,18628,0,0,18628,,,,,
hqc-192,clean,21104,0,0,21104,,,,,
hqc-256,clean,26260,0,0,26260,,,,,
ml-kem-1024,clean,6160,0,0,6160,,,,,
-ml-kem-1024,m4fspeed,16912,0,0,16912,,,,,
-ml-kem-1024,m4fstack,14012,0,0,14012,,,,,
+ml-kem-1024,m4fspeed,16916,0,0,16916,,,,,
+ml-kem-1024,m4fstack,14016,0,0,14016,,,,,
ml-kem-512,clean,5116,0,0,5116,,,,,
-ml-kem-512,m4fspeed,15844,0,0,15844,,,,,
-ml-kem-512,m4fstack,13324,0,0,13324,,,,,
+ml-kem-512,m4fspeed,15848,0,0,15848,,,,,
+ml-kem-512,m4fstack,13328,0,0,13328,,,,,
ml-kem-768,clean,5120,0,0,5120,,,,,
-ml-kem-768,m4fspeed,16012,0,0,16012,,,,,
-ml-kem-768,m4fstack,13316,0,0,13316,,,,,
+ml-kem-768,m4fspeed,16016,0,0,16016,,,,,
+ml-kem-768,m4fstack,13320,0,0,13320,,,,,
Signature Schemes,,,,,,,,,,
Scheme,Implementation,.text [bytes],.data [bytes],.bss [bytes],Total [bytes],,,,,
aimer128f,m4speed,15992,0,0,15992,,,,,
diff --git a/benchmarks.md b/benchmarks.md
index e131b348..afe0ae3a 100644
--- a/benchmarks.md
+++ b/benchmarks.md
@@ -9,15 +9,15 @@
| hqc-128 (10 executions) | clean | AVG: 52,705,201
MIN: 52,705,180
MAX: 52,705,224 | AVG: 105,650,897
MIN: 105,650,877
MAX: 105,650,927 | AVG: 159,569,179
MIN: 159,569,176
MAX: 159,569,183 |
| hqc-192 (10 executions) | clean | AVG: 161,458,617
MIN: 161,458,590
MAX: 161,458,638 | AVG: 323,146,261
MIN: 323,146,250
MAX: 323,146,292 | AVG: 486,156,251
MIN: 486,156,214
MAX: 486,156,266 |
| hqc-256 (10 executions) | clean | AVG: 295,934,078
MIN: 295,934,057
MAX: 295,934,104 | AVG: 591,853,870
MIN: 591,853,850
MAX: 591,853,898 | AVG: 891,163,005
MIN: 891,162,988
MAX: 891,163,038 |
-| ml-kem-1024 (10 executions) | clean | AVG: 1,540,607
MIN: 1,535,678
MAX: 1,547,404 | AVG: 1,712,361
MIN: 1,707,431
MAX: 1,719,164 | AVG: 2,024,616
MIN: 2,019,674
MAX: 2,031,410 |
-| ml-kem-1024 (10 executions) | m4fspeed | AVG: 1,007,759
MIN: 1,004,549
MAX: 1,017,159 | AVG: 1,030,702
MIN: 1,027,443
MAX: 1,040,101 | AVG: 1,093,153
MIN: 1,089,895
MAX: 1,102,552 |
-| ml-kem-1024 (10 executions) | m4fstack | AVG: 1,010,100
MIN: 1,007,459
MAX: 1,018,999 | AVG: 1,038,190
MIN: 1,035,550
MAX: 1,047,090 | AVG: 1,101,234
MIN: 1,098,594
MAX: 1,110,134 |
-| ml-kem-512 (10 executions) | clean | AVG: 595,882
MIN: 595,636
MAX: 596,329 | AVG: 700,689
MIN: 700,440
MAX: 701,134 | AVG: 888,740
MIN: 888,494
MAX: 889,189 |
-| ml-kem-512 (10 executions) | m4fspeed | AVG: 388,544
MIN: 387,190
MAX: 398,814 | AVG: 392,178
MIN: 390,825
MAX: 402,445 | AVG: 429,456
MIN: 428,102
MAX: 439,722 |
-| ml-kem-512 (10 executions) | m4fstack | AVG: 386,999
MIN: 386,677
MAX: 387,465 | AVG: 392,813
MIN: 392,492
MAX: 393,280 | AVG: 430,152
MIN: 429,831
MAX: 430,619 |
-| ml-kem-768 (10 executions) | clean | AVG: 990,901
MIN: 986,123
MAX: 997,663 | AVG: 1,140,412
MIN: 1,135,630
MAX: 1,147,169 | AVG: 1,390,176
MIN: 1,385,399
MAX: 1,396,936 |
-| ml-kem-768 (10 executions) | m4fspeed | AVG: 631,949
MIN: 631,658
MAX: 632,164 | AVG: 656,369
MIN: 656,082
MAX: 656,596 | AVG: 705,436
MIN: 705,148
MAX: 705,663 |
-| ml-kem-768 (10 executions) | m4fstack | AVG: 634,227
MIN: 632,761
MAX: 644,635 | AVG: 662,468
MIN: 661,003
MAX: 672,877 | AVG: 711,996
MIN: 710,530
MAX: 722,405 |
+| ml-kem-1024 (10 executions) | clean | AVG: 1,536,343
MIN: 1,535,750
MAX: 1,536,698 | AVG: 1,708,071
MIN: 1,707,476
MAX: 1,708,427 | AVG: 2,020,327
MIN: 2,019,721
MAX: 2,020,672 |
+| ml-kem-1024 (10 executions) | m4fspeed | AVG: 1,018,976
MIN: 1,014,877
MAX: 1,026,934 | AVG: 1,031,565
MIN: 1,027,454
MAX: 1,039,544 | AVG: 1,094,008
MIN: 1,089,897
MAX: 1,101,987 |
+| ml-kem-1024 (10 executions) | m4fstack | AVG: 1,020,202
MIN: 1,017,478
MAX: 1,029,553 | AVG: 1,037,953
MIN: 1,035,260
MAX: 1,047,298 | AVG: 1,100,982
MIN: 1,098,251
MAX: 1,110,327 |
+| ml-kem-512 (10 executions) | clean | AVG: 595,793
MIN: 595,576
MAX: 595,971 | AVG: 700,605
MIN: 700,383
MAX: 700,779 | AVG: 888,653
MIN: 888,436
MAX: 888,831 |
+| ml-kem-512 (10 executions) | m4fspeed | AVG: 392,423
MIN: 392,211
MAX: 392,614 | AVG: 390,881
MIN: 390,671
MAX: 391,073 | AVG: 428,167
MIN: 427,955
MAX: 428,357 |
+| ml-kem-512 (10 executions) | m4fstack | AVG: 392,224
MIN: 391,772
MAX: 392,541 | AVG: 392,864
MIN: 392,407
MAX: 393,181 | AVG: 430,202
MIN: 429,745
MAX: 430,519 |
+| ml-kem-768 (10 executions) | clean | AVG: 988,722
MIN: 985,880
MAX: 998,135 | AVG: 1,138,225
MIN: 1,135,419
MAX: 1,147,634 | AVG: 1,387,984
MIN: 1,385,144
MAX: 1,397,397 |
+| ml-kem-768 (10 executions) | m4fspeed | AVG: 642,096
MIN: 639,116
MAX: 651,103 | AVG: 658,754
MIN: 655,785
MAX: 667,769 | AVG: 707,827
MIN: 704,858
MAX: 716,842 |
+| ml-kem-768 (10 executions) | m4fstack | AVG: 644,195
MIN: 640,433
MAX: 652,374 | AVG: 664,654
MIN: 660,893
MAX: 672,834 | AVG: 714,194
MIN: 710,433
MAX: 722,374 |
## Signature Schemes
| scheme | implementation | key generation [cycles] | sign [cycles] | verify [cycles] |
| ------ | -------------- | ----------------------- | ------------- | --------------- |
@@ -319,15 +319,15 @@
| hqc-128 | clean | 0.4% | 0.8% | 0.5% |
| hqc-192 | clean | 0.3% | 0.5% | 0.3% |
| hqc-256 | clean | 0.2% | 0.4% | 0.3% |
-| ml-kem-1024 | clean | 50.0% | 45.6% | 38.6% |
-| ml-kem-1024 | m4fspeed | 76.1% | 75.5% | 71.2% |
-| ml-kem-1024 | m4fstack | 75.9% | 74.9% | 70.7% |
-| ml-kem-512 | clean | 49.9% | 41.1% | 32.5% |
-| ml-kem-512 | m4fspeed | 76.5% | 73.5% | 67.2% |
-| ml-kem-512 | m4fstack | 76.6% | 73.2% | 66.9% |
-| ml-kem-768 | clean | 48.6% | 43.2% | 35.4% |
-| ml-kem-768 | m4fspeed | 75.5% | 74.4% | 69.3% |
-| ml-kem-768 | m4fstack | 75.3% | 73.9% | 68.7% |
+| ml-kem-1024 | clean | 50.0% | 45.7% | 38.6% |
+| ml-kem-1024 | m4fspeed | 75.3% | 75.5% | 71.2% |
+| ml-kem-1024 | m4fstack | 75.1% | 74.9% | 70.7% |
+| ml-kem-512 | clean | 49.8% | 41.1% | 32.5% |
+| ml-kem-512 | m4fspeed | 75.5% | 73.5% | 67.1% |
+| ml-kem-512 | m4fstack | 75.6% | 73.2% | 66.9% |
+| ml-kem-768 | clean | 48.5% | 43.2% | 35.4% |
+| ml-kem-768 | m4fspeed | 74.5% | 74.4% | 69.2% |
+| ml-kem-768 | m4fstack | 74.4% | 73.9% | 68.7% |
## Signature Schemes
| Scheme | Implementation | Key Generation [%] | Sign [%] | Verify [%] |
| ------ | -------------- | ------------------ | -------- | ---------- |
@@ -475,14 +475,14 @@
| hqc-192 | clean | 21,104 | 0 | 0 | 21,104 |
| hqc-256 | clean | 26,260 | 0 | 0 | 26,260 |
| ml-kem-1024 | clean | 6,160 | 0 | 0 | 6,160 |
-| ml-kem-1024 | m4fspeed | 16,912 | 0 | 0 | 16,912 |
-| ml-kem-1024 | m4fstack | 14,012 | 0 | 0 | 14,012 |
+| ml-kem-1024 | m4fspeed | 16,916 | 0 | 0 | 16,916 |
+| ml-kem-1024 | m4fstack | 14,016 | 0 | 0 | 14,016 |
| ml-kem-512 | clean | 5,116 | 0 | 0 | 5,116 |
-| ml-kem-512 | m4fspeed | 15,844 | 0 | 0 | 15,844 |
-| ml-kem-512 | m4fstack | 13,324 | 0 | 0 | 13,324 |
+| ml-kem-512 | m4fspeed | 15,848 | 0 | 0 | 15,848 |
+| ml-kem-512 | m4fstack | 13,328 | 0 | 0 | 13,328 |
| ml-kem-768 | clean | 5,120 | 0 | 0 | 5,120 |
-| ml-kem-768 | m4fspeed | 16,012 | 0 | 0 | 16,012 |
-| ml-kem-768 | m4fstack | 13,316 | 0 | 0 | 13,316 |
+| ml-kem-768 | m4fspeed | 16,016 | 0 | 0 | 16,016 |
+| ml-kem-768 | m4fstack | 13,320 | 0 | 0 | 13,320 |
## Signature Schemes
| Scheme | Implementation | .text [bytes] | .data [bytes] | .bss [bytes] | Total [bytes] |
| ------ | -------------- | ------------- | ------------- | ------------ | ------------- |
diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.c b/crypto_kem/ml-kem-512/m4fspeed/poly.c
index 401b26b7..5a6d0abb 100644
--- a/crypto_kem/ml-kem-512/m4fspeed/poly.c
+++ b/crypto_kem/ml-kem-512/m4fspeed/poly.c
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
int i;
uint16_t t0, t1;
+ poly_reduce(a);
poly_reduce(a);
for (i = 0; i < KYBER_N / 2; i++) {
diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.c b/crypto_kem/ml-kem-512/m4fstack/poly.c
index 443fdbae..ed54ec87 100644
--- a/crypto_kem/ml-kem-512/m4fstack/poly.c
+++ b/crypto_kem/ml-kem-512/m4fstack/poly.c
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
int i;
uint16_t t0, t1;
+ poly_reduce(a);
poly_reduce(a);
for (i = 0; i < KYBER_N / 2; i++) {
diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.c b/crypto_kem/ml-kem-768/m4fspeed/poly.c
index b52060f9..b909a85c 100644
--- a/crypto_kem/ml-kem-768/m4fspeed/poly.c
+++ b/crypto_kem/ml-kem-768/m4fspeed/poly.c
@@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) {
d0 >>= 31;
t[k] = d0 & 0x7ff;
}
-
+
r[352*i+11*j+ 0] = t[0] & 0xff;
r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3);
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
int i;
uint16_t t0, t1;
+ poly_reduce(a);
poly_reduce(a);
for (i = 0; i < KYBER_N / 2; i++) {
@@ -465,7 +466,7 @@ void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add
* Using strategy of better accumulation (initial step).
* Arguments: - const poly *a: pointer to input polynomial
* - const poly *b: pointer to input polynomial
-* - const poly *a_prime: pointer to a pre-multiplied by zetas
+* - const poly *a_prime: pointer to a pre-multiplied by zetas
* - int32_t *r_tmp: array for accumulating unreduced results
**************************************************/
extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
@@ -481,7 +482,7 @@ void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const
* Using strategy of better accumulation.
* Arguments: - const poly *a: pointer to input polynomial
* - const poly *b: pointer to input polynomial
-* - const poly *a_prime: pointer to a pre-multiplied by zetas
+* - const poly *a_prime: pointer to a pre-multiplied by zetas
* - int32_t *r_tmp: array for accumulating unreduced results
**************************************************/
extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
@@ -497,7 +498,7 @@ void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, co
* Using strategy of better accumulation (final step).
* Arguments: - const poly *a: pointer to input polynomial
* - const poly *b: pointer to input polynomial
-* - const poly *a_prime: pointer to a pre-multiplied by zetas
+* - const poly *a_prime: pointer to a pre-multiplied by zetas
* - poly *r: pointer to output polynomial
* - int32_t *r_tmp: array for accumulating unreduced results
**************************************************/
diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.c b/crypto_kem/ml-kem-768/m4fstack/poly.c
index 35475adb..fb13d155 100644
--- a/crypto_kem/ml-kem-768/m4fstack/poly.c
+++ b/crypto_kem/ml-kem-768/m4fstack/poly.c
@@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) {
d0 >>= 31;
t[k] = d0 & 0x7ff;
}
-
+
r[352*i+11*j+ 0] = t[0] & 0xff;
r[352*i+11*j+ 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3);
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
int i;
uint16_t t0, t1;
+ poly_reduce(a);
poly_reduce(a);
for (i = 0; i < KYBER_N / 2; i++) {
diff --git a/mk/stm32f4discovery.mk b/mk/stm32f4discovery.mk
index f69ee7db..4be2c2cf 100644
--- a/mk/stm32f4discovery.mk
+++ b/mk/stm32f4discovery.mk
@@ -25,7 +25,7 @@ elf/boardtest-fast.elf: CPPFLAGS+=-DSRAM_TIMING_TEST -DHAS_SRAM2 -DHAS_CCM
elf/crypto_kem_frodokem640aes_m4_%.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
elf/mupq_pqclean_crypto_kem_frodokem640shake_opt_%.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
-elf/crypto_sign_dilithium5_m4f_%.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
+elf/crypto_sign_ml-dsa-87_m4f_%.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
elf/crypto_sign_perk-256-%_m4_testvectors.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
elf/crypto_sign_perk-256-fast%_m4_test.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld
elf/crypto_sign_haetae5_m4f_%.elf: LDSCRIPT=ldscripts/stm32f4discovery_fullram.ld