diff --git a/wolfcrypt/src/sp_arm32.c b/wolfcrypt/src/sp_arm32.c
index 13fe73a0f7..8a2f2c1781 100644
--- a/wolfcrypt/src/sp_arm32.c
+++ b/wolfcrypt/src/sp_arm32.c
@@ -70518,7 +70518,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_8(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 8, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_8(r->x, p256_mod);
@@ -70527,7 +70527,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_8(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 8, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_8(r->y, p256_mod);
@@ -70536,7 +70536,6 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -70839,6 +70838,61 @@ static void sp_256_div2_8(sp_digit* r_p, const sp_digit* a_p, const sp_digit* m_
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_8(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_8(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_8(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_8(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_8(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_8_ctx {
     int state;
@@ -70849,6 +70903,12 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_8_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -70973,62 +71033,6 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_8(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_8(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_8(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_8(y, y, t2, p256_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -71054,6 +71058,7 @@ static int sp_256_iszero_8(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -71061,6 +71066,81 @@ static int sp_256_iszero_8(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_8(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* t3 = t + 4*8;
+    sp_digit* t4 = t + 6*8;
+    sp_digit* t5 = t + 8*8;
+    sp_digit* t6 = t + 10*8;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(t2, t1) &
+            sp_256_cmp_equal_8(t4, t3)) {
+        sp_256_proj_point_dbl_8(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(x, x, t5, p256_mod);
+        sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_8(t3, y, p256_mod);
+        sp_256_mont_sub_8(x, x, t3, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+        sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(y, y, t5, p256_mod);
+        for (i = 0; i < 8; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_8_ctx {
@@ -71079,6 +71159,13 @@ typedef struct sp_256_proj_point_add_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_8_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -71110,252 +71197,149 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_8(ctx->t1, p256_mod, q->y);
-        sp_256_norm_8(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-            (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_8_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_8(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_8(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_8(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 8; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_8(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* t3 = t + 4*8;
-    sp_digit* t4 = t + 6*8;
-    sp_digit* t5 = t + 8*8;
-    sp_digit* t6 = t + 10*8;
-
-
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_8(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_8(t2) & sp_256_iszero_8(t4) & maskt) {
-            sp_256_proj_point_dbl_8(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(x, x, t5, p256_mod);
-            sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_dbl_8(t3, y, p256_mod);
-            sp_256_mont_sub_8(x, x, t3, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-            sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(y, y, t5, p256_mod);
-
-            for (i = 0; i < 8; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -71643,7 +71627,6 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_8(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_8(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -71661,7 +71644,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
         sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_8(t2, b, p256_mod);
         sp_256_mont_sub_8(x, x, t2, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_8(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -71691,7 +71674,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_dbl_8(t2, b, p256_mod);
     sp_256_mont_sub_8(x, x, t2, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
     sp_256_mont_dbl_lower_8(b, t2, p256_mod);
     /* Z = Z*Y */
@@ -71701,7 +71684,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_8(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_8(y, y, p256_mod);
 }
@@ -71746,8 +71729,8 @@ typedef struct sp_table_entry_256 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_8(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*8;
@@ -71756,12 +71739,17 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*8;
     sp_digit* t6 = t + 10*8;
 
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(p->x, t2) &
+            sp_256_cmp_equal_8(p->y, t4)) {
         sp_256_proj_point_dbl_8(r, p, t);
     }
     else {
@@ -71773,12 +71761,6 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_8(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -75856,7 +75838,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_8(r);
 
-            if (sp_256_iszero_8(r) == 0) {
+            if (!sp_256_iszero_8(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 8, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -75865,7 +75847,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_8(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_8(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_8(s))) {
                     break;
                 }
             }
@@ -87956,7 +87938,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_12(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 12, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_12(r->x, p384_mod);
@@ -87965,7 +87947,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_12(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 12, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_12(r->y, p384_mod);
@@ -87974,7 +87956,6 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -88247,6 +88228,61 @@ static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_12(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_12(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_12(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_12(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_12(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_12_ctx {
     int state;
@@ -88257,6 +88293,12 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_12_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -88381,62 +88423,6 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_12(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_12(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_12(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_12(y, y, t2, p384_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -88464,6 +88450,7 @@ static int sp_384_iszero_12(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -88471,6 +88458,81 @@ static int sp_384_iszero_12(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_12(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* t3 = t + 4*12;
+    sp_digit* t4 = t + 6*12;
+    sp_digit* t5 = t + 8*12;
+    sp_digit* t6 = t + 10*12;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(t2, t1) &
+            sp_384_cmp_equal_12(t4, t3)) {
+        sp_384_proj_point_dbl_12(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(x, x, t5, p384_mod);
+        sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_12(t3, y, p384_mod);
+        sp_384_mont_sub_12(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+        sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(y, y, t5, p384_mod);
+        for (i = 0; i < 12; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_12_ctx {
@@ -88489,6 +88551,13 @@ typedef struct sp_384_proj_point_add_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_12_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -88520,252 +88589,149 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_12(ctx->t1, p384_mod, q->y);
-        sp_384_norm_12(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-            (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_12_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_12(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_12(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_12(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 12; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_12(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* t3 = t + 4*12;
-    sp_digit* t4 = t + 6*12;
-    sp_digit* t5 = t + 8*12;
-    sp_digit* t6 = t + 10*12;
-
-
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_12(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_12(t2) & sp_384_iszero_12(t4) & maskt) {
-            sp_384_proj_point_dbl_12(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(x, x, t5, p384_mod);
-            sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_12(t3, y, p384_mod);
-            sp_384_mont_sub_12(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-            sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(y, y, t5, p384_mod);
-
-            for (i = 0; i < 12; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -89077,7 +89043,6 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_12(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_12(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -89095,7 +89060,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
         sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_12(t2, b, p384_mod);
         sp_384_mont_sub_12(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_12(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -89125,7 +89090,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_12(t2, b, p384_mod);
     sp_384_mont_sub_12(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_12(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -89135,7 +89100,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_12(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_12(y, y, p384_mod);
 }
@@ -89180,8 +89145,8 @@ typedef struct sp_table_entry_384 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_12(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*12;
@@ -89190,12 +89155,17 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*12;
     sp_digit* t6 = t + 10*12;
 
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(p->x, t2) &
+            sp_384_cmp_equal_12(p->y, t4)) {
         sp_384_proj_point_dbl_12(r, p, t);
     }
     else {
@@ -89207,12 +89177,6 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_12(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -93470,7 +93434,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_12(r);
 
-            if (sp_384_iszero_12(r) == 0) {
+            if (!sp_384_iszero_12(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 12, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -93479,7 +93443,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_12(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_12(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_12(s))) {
                     break;
                 }
             }
@@ -114610,7 +114574,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_17(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 17, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_17(r->x, p521_mod);
@@ -114619,7 +114583,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_17(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 17, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_17(r->y, p521_mod);
@@ -114628,7 +114592,6 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -115079,6 +115042,61 @@ static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_17(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_17(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_17(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_17(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_17(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_17_ctx {
     int state;
@@ -115089,6 +115107,12 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_17_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -115213,62 +115237,6 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_17(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_17(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_17(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_17(y, y, t2, p521_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -115299,6 +115267,7 @@ static int sp_521_iszero_17(const sp_digit* a)
             a[16]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -115306,6 +115275,81 @@ static int sp_521_iszero_17(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_17(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* t3 = t + 4*17;
+    sp_digit* t4 = t + 6*17;
+    sp_digit* t5 = t + 8*17;
+    sp_digit* t6 = t + 10*17;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(t2, t1) &
+            sp_521_cmp_equal_17(t4, t3)) {
+        sp_521_proj_point_dbl_17(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(x, x, t5, p521_mod);
+        sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_17(t3, y, p521_mod);
+        sp_521_mont_sub_17(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+        sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(y, y, t5, p521_mod);
+        for (i = 0; i < 17; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_17_ctx {
@@ -115324,6 +115368,13 @@ typedef struct sp_521_proj_point_add_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_17_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -115355,252 +115406,149 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_17(ctx->t1, p521_mod, q->y);
-        sp_521_norm_17(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-            (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_17_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_17(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_17(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_17(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 17; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_17(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* t3 = t + 4*17;
-    sp_digit* t4 = t + 6*17;
-    sp_digit* t5 = t + 8*17;
-    sp_digit* t6 = t + 10*17;
-
-
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_17(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_17(t2) & sp_521_iszero_17(t4) & maskt) {
-            sp_521_proj_point_dbl_17(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(x, x, t5, p521_mod);
-            sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_17(t3, y, p521_mod);
-            sp_521_mont_sub_17(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-            sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(y, y, t5, p521_mod);
-
-            for (i = 0; i < 17; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -115946,7 +115894,6 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_17(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_17(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -115964,7 +115911,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
         sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_17(t2, b, p521_mod);
         sp_521_mont_sub_17(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_17(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -115994,7 +115941,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_17(t2, b, p521_mod);
     sp_521_mont_sub_17(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_17(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -116004,7 +115951,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_17(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_17(y, y, p521_mod);
 }
@@ -116049,8 +115996,8 @@ typedef struct sp_table_entry_521 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_17(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*17;
@@ -116059,12 +116006,17 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*17;
     sp_digit* t6 = t + 10*17;
 
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(p->x, t2) &
+            sp_521_cmp_equal_17(p->y, t4)) {
         sp_521_proj_point_dbl_17(r, p, t);
     }
     else {
@@ -116076,12 +116028,6 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_17(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -121612,7 +121558,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_17(r);
 
-            if (sp_521_iszero_17(r) == 0) {
+            if (!sp_521_iszero_17(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 17, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -121626,7 +121572,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_17(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_17(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_17(s))) {
                     break;
                 }
             }
@@ -143695,7 +143641,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_32(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 32, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_32(r->x, p1024_mod);
@@ -143704,7 +143650,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_32(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 32, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_32(r->y, p1024_mod);
@@ -143713,7 +143659,6 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -144860,6 +144805,61 @@ static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_32(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_32(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_32_ctx {
     int state;
@@ -144870,6 +144870,12 @@ typedef struct sp_1024_proj_point_dbl_32_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_32_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -144994,62 +145000,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_32(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
-}
-
 #ifdef WOLFSSL_SP_SMALL
 /* Sub b from a into r. (r = a - b)
  *
@@ -145202,6 +145152,7 @@ static int sp_1024_iszero_32(const sp_digit* a)
             a[24] | a[25] | a[26] | a[27] | a[28] | a[29] | a[30] | a[31]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -145209,6 +145160,81 @@ static int sp_1024_iszero_32(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_32(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
+    sp_digit* t3 = t + 4*32;
+    sp_digit* t4 = t + 6*32;
+    sp_digit* t5 = t + 8*32;
+    sp_digit* t6 = t + 10*32;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(t2, t1) &
+            sp_1024_cmp_equal_32(t4, t3)) {
+        sp_1024_proj_point_dbl_32(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_32(t3, y, p1024_mod);
+        sp_1024_mont_sub_32(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+        sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(y, y, t5, p1024_mod);
+        for (i = 0; i < 32; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_32_ctx {
@@ -145227,6 +145253,13 @@ typedef struct sp_1024_proj_point_add_32_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_32_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -145258,252 +145291,149 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_32(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_32(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-            (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_32_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_32(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_32(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_32(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 32; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_32(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* t3 = t + 4*32;
-    sp_digit* t4 = t + 6*32;
-    sp_digit* t5 = t + 8*32;
-    sp_digit* t6 = t + 10*32;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_32(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_32(t2) & sp_1024_iszero_32(t4) & maskt) {
-            sp_1024_proj_point_dbl_32(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_32(t3, y, p1024_mod);
-            sp_1024_mont_sub_32(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-            sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 32; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply the point by the scalar and return the result.
  * If map is true then convert result to affine coordinates.
  *
@@ -145679,7 +145609,6 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_32(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_32(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -145697,7 +145626,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
         sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_32(t2, b, p1024_mod);
         sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -145727,7 +145656,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_32(t2, b, p1024_mod);
     sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -145737,7 +145666,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_32(y, y, p1024_mod);
 }
@@ -145782,8 +145711,8 @@ typedef struct sp_table_entry_1024 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*32;
@@ -145792,12 +145721,17 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*32;
     sp_digit* t6 = t + 10*32;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(p->x, t2) &
+            sp_1024_cmp_equal_32(p->y, t4)) {
         sp_1024_proj_point_dbl_32(r, p, t);
     }
     else {
@@ -145809,12 +145743,6 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_arm64.c b/wolfcrypt/src/sp_arm64.c
index 97d121d49c..6a33f1c892 100644
--- a/wolfcrypt/src/sp_arm64.c
+++ b/wolfcrypt/src/sp_arm64.c
@@ -23227,7 +23227,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_4(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 4, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_4(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_4(r->x, p256_mod);
@@ -23236,7 +23236,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_4(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 4, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_4(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_4(r->y, p256_mod);
@@ -23245,7 +23245,6 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -23437,6 +23436,61 @@ static void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_4(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_4(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_4(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_4(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_4(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_4(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_4(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_4(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_4(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_4(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_4(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_4(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_4(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_4_ctx {
     int state;
@@ -23447,6 +23501,12 @@ typedef struct sp_256_proj_point_dbl_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_4_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -23571,62 +23631,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_4(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_4(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_4(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_4(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_4(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_4(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_4(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_4(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_4(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_4(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_4(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_4(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_4(y, y, t2, p256_mod);
-}
-
 #define sp_256_mont_tpl_lower_4 sp_256_mont_tpl_4
 /* Subtract two Montgomery form numbers (r = a - b % m).
  *
@@ -23750,7 +23754,6 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_4(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_4(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -23767,7 +23770,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
@@ -23795,7 +23798,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* X = A^2 - 2B */
     sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
     /* Z = Z*Y */
     sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
@@ -23804,7 +23807,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_4(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_4(y, y, p256_mod);
 }
@@ -23833,6 +23836,7 @@ static int sp_256_iszero_4(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -23840,6 +23844,80 @@ static int sp_256_iszero_4(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_4(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* t3 = t + 4*4;
+    sp_digit* t4 = t + 6*4;
+    sp_digit* t5 = t + 8*4;
+    sp_digit* t6 = t + 10*4;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(t2, t1) &
+            sp_256_cmp_equal_4(t4, t3)) {
+        sp_256_proj_point_dbl_4(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(x, x, t5, p256_mod);
+        sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_4(y, y, x, p256_mod);
+        sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(y, y, t5, p256_mod);
+        for (i = 0; i < 4; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_4_ctx {
@@ -23858,6 +23936,13 @@ typedef struct sp_256_proj_point_add_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_4_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -23889,251 +23974,148 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_4(ctx->t1, p256_mod, q->y);
-        sp_256_norm_4(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-            (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_4_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_4(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_4(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_4(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_4(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 4; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_4(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* t3 = t + 4*4;
-    sp_digit* t4 = t + 6*4;
-    sp_digit* t5 = t + 8*4;
-    sp_digit* t6 = t + 10*4;
-
-
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_4(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_4(t2) & sp_256_iszero_4(t4) & maskt) {
-            sp_256_proj_point_dbl_4(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_4(x, x, t5, p256_mod);
-            sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_4(y, y, x, p256_mod);
-            sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_4(y, y, t5, p256_mod);
-
-            for (i = 0; i < 4; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -24184,7 +24166,7 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
@@ -24198,7 +24180,6 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, y, t1, p256_mod);
-
         /* Y = Y/2 */
         sp_256_div2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
@@ -24560,8 +24541,8 @@ typedef struct sp_table_entry_256 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*4;
@@ -24570,12 +24551,17 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*4;
     sp_digit* t6 = t + 10*4;
 
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(p->x, t2) &
+            sp_256_cmp_equal_4(p->y, t4)) {
         sp_256_proj_point_dbl_4(r, p, t);
     }
     else {
@@ -24587,12 +24573,6 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_4(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -40604,7 +40584,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_4(r);
 
-            if (sp_256_iszero_4(r) == 0) {
+            if (!sp_256_iszero_4(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 4, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -40613,7 +40593,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_4(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_4(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_4(s))) {
                     break;
                 }
             }
@@ -43547,7 +43527,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_6(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 6, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_6(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_6(r->x, p384_mod);
@@ -43556,7 +43536,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_6(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 6, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_6(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_6(r->y, p384_mod);
@@ -43565,7 +43545,6 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -43754,6 +43733,61 @@ static void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_6(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_6(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_6(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_6(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_6(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_6(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_6(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_6(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_6(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_6(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_6(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_6(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_6(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_6(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_6(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_6_ctx {
     int state;
@@ -43764,6 +43798,12 @@ typedef struct sp_384_proj_point_dbl_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_6_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -43888,62 +43928,6 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_6(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_6(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_6(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_6(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_6(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_6(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_6(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_6(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_6(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_6(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_6(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_6(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_6(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_6(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_6(y, y, t2, p384_mod);
-}
-
 #define sp_384_mont_dbl_lower_6 sp_384_mont_dbl_6
 #define sp_384_mont_tpl_lower_6 sp_384_mont_tpl_6
 /* Double the Montgomery form projective point p a number of times.
@@ -43975,7 +43959,6 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_6(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_6(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -43993,7 +43976,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
         sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_6(t2, b, p384_mod);
         sp_384_mont_sub_6(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -44023,7 +44006,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_6(t2, b, p384_mod);
     sp_384_mont_sub_6(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_6(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -44033,7 +44016,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_6(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_6(y, y, p384_mod);
 }
@@ -44062,6 +44045,7 @@ static int sp_384_iszero_6(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -44069,6 +44053,81 @@ static int sp_384_iszero_6(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_6(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* t3 = t + 4*6;
+    sp_digit* t4 = t + 6*6;
+    sp_digit* t5 = t + 8*6;
+    sp_digit* t6 = t + 10*6;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(t2, t1) &
+            sp_384_cmp_equal_6(t4, t3)) {
+        sp_384_proj_point_dbl_6(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_6(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_6(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(x, x, t5, p384_mod);
+        sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_6(t3, y, p384_mod);
+        sp_384_mont_sub_6(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_6(y, y, x, p384_mod);
+        sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(y, y, t5, p384_mod);
+        for (i = 0; i < 6; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_6_ctx {
@@ -44087,6 +44146,13 @@ typedef struct sp_384_proj_point_add_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_6_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -44118,252 +44184,149 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_6(ctx->t1, p384_mod, q->y);
-        sp_384_norm_6(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-            (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_6_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_6(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_6(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_6(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_6(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_6(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 6; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_6(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* t3 = t + 4*6;
-    sp_digit* t4 = t + 6*6;
-    sp_digit* t5 = t + 8*6;
-    sp_digit* t6 = t + 10*6;
-
-
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_6(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_6(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_6(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_6(t2) & sp_384_iszero_6(t4) & maskt) {
-            sp_384_proj_point_dbl_6(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_6(x, x, t5, p384_mod);
-            sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_6(t3, y, p384_mod);
-            sp_384_mont_sub_6(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_6(y, y, x, p384_mod);
-            sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_6(y, y, t5, p384_mod);
-
-            for (i = 0; i < 6; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -44416,7 +44379,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_6(t2, b, p384_mod);
         sp_384_mont_sub_6(x, x, t2, p384_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -44431,7 +44394,6 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_6(y, y, t1, p384_mod);
-
         /* Y = Y/2 */
         sp_384_div2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
@@ -44805,8 +44767,8 @@ typedef struct sp_table_entry_384 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_6(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*6;
@@ -44815,12 +44777,17 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*6;
     sp_digit* t6 = t + 10*6;
 
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(p->x, t2) &
+            sp_384_cmp_equal_6(p->y, t4)) {
         sp_384_proj_point_dbl_6(r, p, t);
     }
     else {
@@ -44832,12 +44799,6 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_6(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -66728,7 +66689,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_6(r);
 
-            if (sp_384_iszero_6(r) == 0) {
+            if (!sp_384_iszero_6(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 6, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -66737,7 +66698,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_6(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_6(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_6(s))) {
                     break;
                 }
             }
@@ -71694,7 +71655,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_9(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_9(r->x, p521_mod);
@@ -71703,7 +71664,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_9(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_9(r->y, p521_mod);
@@ -71712,7 +71673,6 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -72069,6 +72029,61 @@ static void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_9(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_9(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_9(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_9(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_9(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_9_ctx {
     int state;
@@ -72079,6 +72094,12 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_9_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -72203,62 +72224,6 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_9(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_9(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_9(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_9(y, y, t2, p521_mod);
-}
-
 #define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9
 #define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9
 /* Double the Montgomery form projective point p a number of times.
@@ -72290,7 +72255,6 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_9(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_9(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -72308,7 +72272,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -72338,7 +72302,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_9(t2, b, p521_mod);
     sp_521_mont_sub_9(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_9(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -72348,7 +72312,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_9(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_9(y, y, p521_mod);
 }
@@ -72379,6 +72343,7 @@ static int sp_521_iszero_9(const sp_digit* a)
             a[8]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -72386,6 +72351,81 @@ static int sp_521_iszero_9(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_9(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* t3 = t + 4*9;
+    sp_digit* t4 = t + 6*9;
+    sp_digit* t5 = t + 8*9;
+    sp_digit* t6 = t + 10*9;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(t2, t1) &
+            sp_521_cmp_equal_9(t4, t3)) {
+        sp_521_proj_point_dbl_9(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(x, x, t5, p521_mod);
+        sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_9(t3, y, p521_mod);
+        sp_521_mont_sub_9(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+        sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(y, y, t5, p521_mod);
+        for (i = 0; i < 9; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_9_ctx {
@@ -72404,6 +72444,13 @@ typedef struct sp_521_proj_point_add_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_9_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -72435,252 +72482,149 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_9(ctx->t1, p521_mod, q->y);
-        sp_521_norm_9(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-            (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_9(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_9(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_9(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 9; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_9(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* t3 = t + 4*9;
-    sp_digit* t4 = t + 6*9;
-    sp_digit* t5 = t + 8*9;
-    sp_digit* t6 = t + 10*9;
-
-
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_9(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_9(t2) & sp_521_iszero_9(t4) & maskt) {
-            sp_521_proj_point_dbl_9(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(x, x, t5, p521_mod);
-            sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_9(t3, y, p521_mod);
-            sp_521_mont_sub_9(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-            sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(y, y, t5, p521_mod);
-
-            for (i = 0; i < 9; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -72733,7 +72677,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -72748,7 +72692,6 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
-
         /* Y = Y/2 */
         sp_521_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
@@ -73140,8 +73083,8 @@ typedef struct sp_table_entry_521 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_9(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*9;
@@ -73150,12 +73093,17 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*9;
     sp_digit* t6 = t + 10*9;
 
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(p->x, t2) &
+            sp_521_cmp_equal_9(p->y, t4)) {
         sp_521_proj_point_dbl_9(r, p, t);
     }
     else {
@@ -73167,12 +73115,6 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_9(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -111550,7 +111492,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_9(r);
 
-            if (sp_521_iszero_9(r) == 0) {
+            if (!sp_521_iszero_9(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 9, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -111564,7 +111506,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_9(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_9(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_9(s))) {
                     break;
                 }
             }
@@ -115415,7 +115357,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_16(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 16, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_16(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_16(r->x, p1024_mod);
@@ -115424,7 +115366,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_16(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 16, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_16(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_16(r->y, p1024_mod);
@@ -115433,7 +115375,6 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -116083,6 +116024,61 @@ static void sp_1024_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_16(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_16(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_16(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_16(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_16(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_16(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_16(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_16(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_16(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_16(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_16(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_16(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_16_ctx {
     int state;
@@ -116093,6 +116089,12 @@ typedef struct sp_1024_proj_point_dbl_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_16_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -116217,62 +116219,6 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_16(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_16(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_16(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_16(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_16(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_16(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_16(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_16(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_16(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_16(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_16(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_16(y, y, t2, p1024_mod);
-}
-
 #define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16
 #define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16
 /* Double the Montgomery form projective point p a number of times.
@@ -116304,7 +116250,6 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_16(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_16(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -116322,7 +116267,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
         sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_16(t2, b, p1024_mod);
         sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -116352,7 +116297,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_16(t2, b, p1024_mod);
     sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -116362,7 +116307,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_16(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_16(y, y, p1024_mod);
 }
@@ -116494,6 +116439,7 @@ static int sp_1024_iszero_16(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11] | a[12] | a[13] | a[14] | a[15]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -116501,6 +116447,81 @@ static int sp_1024_iszero_16(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_16(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* t3 = t + 4*16;
+    sp_digit* t4 = t + 6*16;
+    sp_digit* t5 = t + 8*16;
+    sp_digit* t6 = t + 10*16;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(t2, t1) &
+            sp_1024_cmp_equal_16(t4, t3)) {
+        sp_1024_proj_point_dbl_16(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_16(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_16(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_16(t3, y, p1024_mod);
+        sp_1024_mont_sub_16(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
+        sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(y, y, t5, p1024_mod);
+        for (i = 0; i < 16; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_16_ctx {
@@ -116519,6 +116540,13 @@ typedef struct sp_1024_proj_point_add_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_16_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -116550,252 +116578,149 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_16(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_16(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-            (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_16_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_16(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_16(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_16(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 16; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_16(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* t3 = t + 4*16;
-    sp_digit* t4 = t + 6*16;
-    sp_digit* t5 = t + 8*16;
-    sp_digit* t6 = t + 10*16;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_16(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_16(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_16(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_16(t2) & sp_1024_iszero_16(t4) & maskt) {
-            sp_1024_proj_point_dbl_16(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_16(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_16(t3, y, p1024_mod);
-            sp_1024_mont_sub_16(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
-            sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_16(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 16; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -116848,7 +116773,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_16(t2, b, p1024_mod);
         sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -116863,7 +116788,6 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_16(y, y, t1, p1024_mod);
-
         /* Y = Y/2 */
         sp_1024_div2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
@@ -117194,8 +117118,8 @@ typedef struct sp_table_entry_1024 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*16;
@@ -117204,12 +117128,17 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*16;
     sp_digit* t6 = t + 10*16;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(p->x, t2) &
+            sp_1024_cmp_equal_16(p->y, t4)) {
         sp_1024_proj_point_dbl_16(r, p, t);
     }
     else {
@@ -117221,12 +117150,6 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_armthumb.c b/wolfcrypt/src/sp_armthumb.c
index 6395b7845c..959b37806d 100644
--- a/wolfcrypt/src/sp_armthumb.c
+++ b/wolfcrypt/src/sp_armthumb.c
@@ -99287,7 +99287,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_8(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 8, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_8(r->x, p256_mod);
@@ -99296,7 +99296,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_8(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 8, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_8(r->y, p256_mod);
@@ -99305,7 +99305,6 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -100463,6 +100462,61 @@ SP_NOINLINE static void sp_256_div2_8(sp_digit* r, const sp_digit* a,
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_8(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_8(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_8(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_8(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_8(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_8_ctx {
     int state;
@@ -100473,6 +100527,12 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_8_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -100597,62 +100657,6 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_8(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_8(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_8(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_8(y, y, t2, p256_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -100678,6 +100682,7 @@ static int sp_256_iszero_8(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -100685,6 +100690,81 @@ static int sp_256_iszero_8(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_8(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* t3 = t + 4*8;
+    sp_digit* t4 = t + 6*8;
+    sp_digit* t5 = t + 8*8;
+    sp_digit* t6 = t + 10*8;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(t2, t1) &
+            sp_256_cmp_equal_8(t4, t3)) {
+        sp_256_proj_point_dbl_8(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(x, x, t5, p256_mod);
+        sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_8(t3, y, p256_mod);
+        sp_256_mont_sub_8(x, x, t3, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+        sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(y, y, t5, p256_mod);
+        for (i = 0; i < 8; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_8_ctx {
@@ -100703,6 +100783,13 @@ typedef struct sp_256_proj_point_add_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_8_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -100734,252 +100821,149 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_8(ctx->t1, p256_mod, q->y);
-        sp_256_norm_8(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-            (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_8_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_8(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_8(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_8(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 8; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_8(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* t3 = t + 4*8;
-    sp_digit* t4 = t + 6*8;
-    sp_digit* t5 = t + 8*8;
-    sp_digit* t6 = t + 10*8;
-
-
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_8(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_8(t2) & sp_256_iszero_8(t4) & maskt) {
-            sp_256_proj_point_dbl_8(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(x, x, t5, p256_mod);
-            sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_dbl_8(t3, y, p256_mod);
-            sp_256_mont_sub_8(x, x, t3, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-            sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(y, y, t5, p256_mod);
-
-            for (i = 0; i < 8; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -101267,7 +101251,6 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_8(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_8(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -101285,7 +101268,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
         sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_8(t2, b, p256_mod);
         sp_256_mont_sub_8(x, x, t2, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_8(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -101315,7 +101298,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_dbl_8(t2, b, p256_mod);
     sp_256_mont_sub_8(x, x, t2, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
     sp_256_mont_dbl_lower_8(b, t2, p256_mod);
     /* Z = Z*Y */
@@ -101325,7 +101308,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_8(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_8(y, y, p256_mod);
 }
@@ -101370,8 +101353,8 @@ typedef struct sp_table_entry_256 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_8(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*8;
@@ -101380,12 +101363,17 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*8;
     sp_digit* t6 = t + 10*8;
 
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(p->x, t2) &
+            sp_256_cmp_equal_8(p->y, t4)) {
         sp_256_proj_point_dbl_8(r, p, t);
     }
     else {
@@ -101397,12 +101385,6 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_8(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -105816,7 +105798,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_8(r);
 
-            if (sp_256_iszero_8(r) == 0) {
+            if (!sp_256_iszero_8(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 8, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -105825,7 +105807,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_8(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_8(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_8(s))) {
                     break;
                 }
             }
@@ -110724,7 +110706,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_12(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 12, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_12(r->x, p384_mod);
@@ -110733,7 +110715,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_12(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 12, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_12(r->y, p384_mod);
@@ -110742,7 +110724,6 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -111131,6 +111112,61 @@ SP_NOINLINE static void sp_384_div2_12(sp_digit* r, const sp_digit* a,
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_12(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_12(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_12(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_12(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_12(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_12_ctx {
     int state;
@@ -111141,6 +111177,12 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_12_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -111265,62 +111307,6 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_12(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_12(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_12(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_12(y, y, t2, p384_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -111348,6 +111334,7 @@ static int sp_384_iszero_12(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -111355,6 +111342,81 @@ static int sp_384_iszero_12(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_12(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* t3 = t + 4*12;
+    sp_digit* t4 = t + 6*12;
+    sp_digit* t5 = t + 8*12;
+    sp_digit* t6 = t + 10*12;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(t2, t1) &
+            sp_384_cmp_equal_12(t4, t3)) {
+        sp_384_proj_point_dbl_12(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(x, x, t5, p384_mod);
+        sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_12(t3, y, p384_mod);
+        sp_384_mont_sub_12(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+        sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(y, y, t5, p384_mod);
+        for (i = 0; i < 12; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_12_ctx {
@@ -111373,6 +111435,13 @@ typedef struct sp_384_proj_point_add_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_12_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -111404,252 +111473,149 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_12(ctx->t1, p384_mod, q->y);
-        sp_384_norm_12(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-            (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_12_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_12(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_12(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_12(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 12; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_12(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* t3 = t + 4*12;
-    sp_digit* t4 = t + 6*12;
-    sp_digit* t5 = t + 8*12;
-    sp_digit* t6 = t + 10*12;
-
-
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_12(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_12(t2) & sp_384_iszero_12(t4) & maskt) {
-            sp_384_proj_point_dbl_12(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(x, x, t5, p384_mod);
-            sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_12(t3, y, p384_mod);
-            sp_384_mont_sub_12(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-            sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(y, y, t5, p384_mod);
-
-            for (i = 0; i < 12; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -111961,7 +111927,6 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_12(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_12(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -111979,7 +111944,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
         sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_12(t2, b, p384_mod);
         sp_384_mont_sub_12(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_12(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -112009,7 +111974,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_12(t2, b, p384_mod);
     sp_384_mont_sub_12(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_12(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -112019,7 +111984,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_12(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_12(y, y, p384_mod);
 }
@@ -112064,8 +112029,8 @@ typedef struct sp_table_entry_384 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_12(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*12;
@@ -112074,12 +112039,17 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*12;
     sp_digit* t6 = t + 10*12;
 
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(p->x, t2) &
+            sp_384_cmp_equal_12(p->y, t4)) {
         sp_384_proj_point_dbl_12(r, p, t);
     }
     else {
@@ -112091,12 +112061,6 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_12(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -116589,7 +116553,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_12(r);
 
-            if (sp_384_iszero_12(r) == 0) {
+            if (!sp_384_iszero_12(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 12, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -116598,7 +116562,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_12(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_12(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_12(s))) {
                     break;
                 }
             }
@@ -123055,7 +123019,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_17(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 17, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_17(r->x, p521_mod);
@@ -123064,7 +123028,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_17(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 17, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_17(r->y, p521_mod);
@@ -123073,7 +123037,6 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -124961,6 +124924,61 @@ SP_NOINLINE static void sp_521_div2_17(sp_digit* r, const sp_digit* a,
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_17(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_17(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_17(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_17(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_17(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_17_ctx {
     int state;
@@ -124971,6 +124989,12 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_17_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -125095,62 +125119,6 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_17(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_17(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_17(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_17(y, y, t2, p521_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -125181,6 +125149,7 @@ static int sp_521_iszero_17(const sp_digit* a)
             a[16]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -125188,6 +125157,81 @@ static int sp_521_iszero_17(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_17(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* t3 = t + 4*17;
+    sp_digit* t4 = t + 6*17;
+    sp_digit* t5 = t + 8*17;
+    sp_digit* t6 = t + 10*17;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(t2, t1) &
+            sp_521_cmp_equal_17(t4, t3)) {
+        sp_521_proj_point_dbl_17(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(x, x, t5, p521_mod);
+        sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_17(t3, y, p521_mod);
+        sp_521_mont_sub_17(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+        sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(y, y, t5, p521_mod);
+        for (i = 0; i < 17; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_17_ctx {
@@ -125206,6 +125250,13 @@ typedef struct sp_521_proj_point_add_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_17_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -125237,252 +125288,149 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_17(ctx->t1, p521_mod, q->y);
-        sp_521_norm_17(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-            (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_17_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_17(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_17(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_17(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 17; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_17(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* t3 = t + 4*17;
-    sp_digit* t4 = t + 6*17;
-    sp_digit* t5 = t + 8*17;
-    sp_digit* t6 = t + 10*17;
-
-
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_17(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_17(t2) & sp_521_iszero_17(t4) & maskt) {
-            sp_521_proj_point_dbl_17(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(x, x, t5, p521_mod);
-            sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_17(t3, y, p521_mod);
-            sp_521_mont_sub_17(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-            sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(y, y, t5, p521_mod);
-
-            for (i = 0; i < 17; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -125828,7 +125776,6 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_17(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_17(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -125846,7 +125793,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
         sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_17(t2, b, p521_mod);
         sp_521_mont_sub_17(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_17(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -125876,7 +125823,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_17(t2, b, p521_mod);
     sp_521_mont_sub_17(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_17(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -125886,7 +125833,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_17(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_17(y, y, p521_mod);
 }
@@ -125931,8 +125878,8 @@ typedef struct sp_table_entry_521 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_17(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*17;
@@ -125941,12 +125888,17 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*17;
     sp_digit* t6 = t + 10*17;
 
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(p->x, t2) &
+            sp_521_cmp_equal_17(p->y, t4)) {
         sp_521_proj_point_dbl_17(r, p, t);
     }
     else {
@@ -125958,12 +125910,6 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_17(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -132792,7 +132738,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_17(r);
 
-            if (sp_521_iszero_17(r) == 0) {
+            if (!sp_521_iszero_17(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 17, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -132806,7 +132752,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_17(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_17(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_17(s))) {
                     break;
                 }
             }
@@ -203894,7 +203840,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_32(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 32, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_32(r->x, p1024_mod);
@@ -203903,7 +203849,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_32(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 32, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_32(r->y, p1024_mod);
@@ -203912,7 +203858,6 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -209608,6 +209553,61 @@ SP_NOINLINE static void sp_1024_div2_32(sp_digit* r, const sp_digit* a,
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_32(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_32(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_32_ctx {
     int state;
@@ -209618,6 +209618,12 @@ typedef struct sp_1024_proj_point_dbl_32_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_32_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -209742,62 +209748,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_32(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
-}
-
 #ifdef WOLFSSL_SP_SMALL
 /* Sub b from a into r. (r = a - b)
  *
@@ -210198,6 +210148,7 @@ static int sp_1024_iszero_32(const sp_digit* a)
             a[24] | a[25] | a[26] | a[27] | a[28] | a[29] | a[30] | a[31]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -210205,6 +210156,81 @@ static int sp_1024_iszero_32(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_32(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
+    sp_digit* t3 = t + 4*32;
+    sp_digit* t4 = t + 6*32;
+    sp_digit* t5 = t + 8*32;
+    sp_digit* t6 = t + 10*32;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(t2, t1) &
+            sp_1024_cmp_equal_32(t4, t3)) {
+        sp_1024_proj_point_dbl_32(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_32(t3, y, p1024_mod);
+        sp_1024_mont_sub_32(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+        sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(y, y, t5, p1024_mod);
+        for (i = 0; i < 32; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_32_ctx {
@@ -210223,6 +210249,13 @@ typedef struct sp_1024_proj_point_add_32_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_32_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -210254,252 +210287,149 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_32(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_32(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-            (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_32_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_32(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_32(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_32(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 32; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_32(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* t3 = t + 4*32;
-    sp_digit* t4 = t + 6*32;
-    sp_digit* t5 = t + 8*32;
-    sp_digit* t6 = t + 10*32;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_32(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_32(t2) & sp_1024_iszero_32(t4) & maskt) {
-            sp_1024_proj_point_dbl_32(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_32(t3, y, p1024_mod);
-            sp_1024_mont_sub_32(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-            sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 32; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply the point by the scalar and return the result.
  * If map is true then convert result to affine coordinates.
  *
@@ -210675,7 +210605,6 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_32(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_32(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -210693,7 +210622,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
         sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_32(t2, b, p1024_mod);
         sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -210723,7 +210652,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_32(t2, b, p1024_mod);
     sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -210733,7 +210662,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_32(y, y, p1024_mod);
 }
@@ -210778,8 +210707,8 @@ typedef struct sp_table_entry_1024 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*32;
@@ -210788,12 +210717,17 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*32;
     sp_digit* t6 = t + 10*32;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(p->x, t2) &
+            sp_1024_cmp_equal_32(p->y, t4)) {
         sp_1024_proj_point_dbl_32(r, p, t);
     }
     else {
@@ -210805,12 +210739,6 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_c32.c b/wolfcrypt/src/sp_c32.c
index 74b0501052..e6bdcfb529 100644
--- a/wolfcrypt/src/sp_c32.c
+++ b/wolfcrypt/src/sp_c32.c
@@ -21076,7 +21076,7 @@ static void sp_256_map_9(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_9(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 9, 0, sizeof(sp_digit) * 9U);
     sp_256_mont_reduce_9(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_9(r->x, p256_mod);
@@ -21085,7 +21085,7 @@ static void sp_256_map_9(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_9(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 9, 0, sizeof(sp_digit) * 9U);
     sp_256_mont_reduce_9(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_9(r->y, p256_mod);
@@ -21094,7 +21094,6 @@ static void sp_256_map_9(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -21259,6 +21258,61 @@ static void sp_256_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_9(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_9(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_9(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_9(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_9(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_9(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_9(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_9(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_9(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_9(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_9(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_9(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_9(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_9(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_9(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_9(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_9_ctx {
     int state;
@@ -21269,6 +21323,12 @@ typedef struct sp_256_proj_point_dbl_9_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_9_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -21393,62 +21453,6 @@ static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_9(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_9(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_9(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_9(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_9(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_9(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_9(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_9(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_9(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_9(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_9(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_9(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_9(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_9(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_9(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_9(y, y, t2, p256_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -21475,6 +21479,7 @@ static int sp_256_iszero_9(const sp_digit* a)
             a[8]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -21482,6 +21487,81 @@ static int sp_256_iszero_9(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_9(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* t3 = t + 4*9;
+    sp_digit* t4 = t + 6*9;
+    sp_digit* t5 = t + 8*9;
+    sp_digit* t6 = t + 10*9;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_9(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_9(t2, t1) &
+            sp_256_cmp_equal_9(t4, t3)) {
+        sp_256_proj_point_dbl_9(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_9(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_9(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_9(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_9(x, x, t5, p256_mod);
+        sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_9(t3, y, p256_mod);
+        sp_256_mont_sub_9(x, x, t3, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_9(y, y, x, p256_mod);
+        sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_9(y, y, t5, p256_mod);
+        for (i = 0; i < 9; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_9_ctx {
@@ -21500,6 +21580,13 @@ typedef struct sp_256_proj_point_add_9_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_9_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -21531,252 +21618,149 @@ static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_9(ctx->t1, p256_mod, q->y);
-        sp_256_norm_9(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) &
-            (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_9(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_9(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_9(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_9(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_9(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_9(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_9(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_9(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_9(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_9(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_9(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_9(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_9(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_9(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_9(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_9(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_9(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_9(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_9(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_dbl_9(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t3, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_9(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 9; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_9(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* t3 = t + 4*9;
-    sp_digit* t4 = t + 6*9;
-    sp_digit* t5 = t + 8*9;
-    sp_digit* t6 = t + 10*9;
-
-
-    /* Check double */
-    (void)sp_256_sub_9(t1, p256_mod, q->y);
-    sp_256_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) &
-        (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_9(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_9(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_9(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_9(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_9(t2) & sp_256_iszero_9(t4) & maskt) {
-            sp_256_proj_point_dbl_9(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_9(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_9(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_9(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_9(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_9(x, x, t5, p256_mod);
-            sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_dbl_9(t3, y, p256_mod);
-            sp_256_mont_sub_9(x, x, t3, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_9(y, y, x, p256_mod);
-            sp_256_mont_mul_9(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_9(y, y, t5, p256_mod);
-
-            for (i = 0; i < 9; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
  * r  The resulting Montgomery form number.
@@ -22213,7 +22197,6 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_9(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_9(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -22231,7 +22214,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
         sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_9(t2, b, p256_mod);
         sp_256_mont_sub_9(x, x, t2, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_9(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_9(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -22261,7 +22244,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
     sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_dbl_9(t2, b, p256_mod);
     sp_256_mont_sub_9(x, x, t2, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_sub_lower_9(t2, b, x, p256_mod);
     sp_256_mont_dbl_lower_9(b, t2, p256_mod);
     /* Z = Z*Y */
@@ -22271,7 +22254,7 @@ static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_9(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_9(y, y, p256_mod);
 }
@@ -22328,7 +22311,7 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r,
         sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_9(t2, b, p256_mod);
         sp_256_mont_sub_9(x, x, t2, p256_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_9(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_9(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -22343,7 +22326,6 @@ static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_9(y, y, t1, p256_mod);
-
         /* Y = Y/2 */
         sp_256_div2_9(r[j].y, y, p256_mod);
         r[j].infinity = 0;
@@ -22728,8 +22710,8 @@ static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_9(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*9;
@@ -22738,12 +22720,17 @@ static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*9;
     sp_digit* t6 = t + 10*9;
 
-    /* Check double */
-    (void)sp_256_sub_9(t1, p256_mod, q->y);
-    sp_256_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_9(p->x, q->x) & sp_256_cmp_equal_9(p->z, q->z) &
-        (sp_256_cmp_equal_9(p->y, q->y) | sp_256_cmp_equal_9(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_9(p->x, t2) &
+            sp_256_cmp_equal_9(p->y, t4)) {
         sp_256_proj_point_dbl_9(r, p, t);
     }
     else {
@@ -22755,12 +22742,6 @@ static void sp_256_proj_point_add_qz1_9(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_9(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -25796,7 +25777,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_9(r);
 
-            if (sp_256_iszero_9(r) == 0) {
+            if (!sp_256_iszero_9(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 9, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -25805,7 +25786,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_9(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_9(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_9(s))) {
                     break;
                 }
             }
@@ -28502,7 +28483,7 @@ static void sp_384_map_15(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_15(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 15, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 15, 0, sizeof(sp_digit) * 15U);
     sp_384_mont_reduce_15(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_15(r->x, p384_mod);
@@ -28511,7 +28492,7 @@ static void sp_384_map_15(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_15(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 15, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 15, 0, sizeof(sp_digit) * 15U);
     sp_384_mont_reduce_15(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_15(r->y, p384_mod);
@@ -28520,7 +28501,6 @@ static void sp_384_map_15(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -28697,6 +28677,61 @@ static void sp_384_div2_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*15;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_15(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_15(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_15(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_15(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_15(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_15(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_15(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_15(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_15(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_15(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_15(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_15(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_15(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_15(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_15(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_15(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_15(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_15(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_15_ctx {
     int state;
@@ -28707,6 +28742,12 @@ typedef struct sp_384_proj_point_dbl_15_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_15_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -28831,62 +28872,6 @@ static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*15;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_15(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_15(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_15(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_15(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_15(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_15(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_15(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_15(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_15(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_15(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_15(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_15(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_15(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_15(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_15(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_15(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_15(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_15(y, y, t2, p384_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -28915,6 +28900,7 @@ static int sp_384_iszero_15(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11] | a[12] | a[13] | a[14]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -28922,6 +28908,81 @@ static int sp_384_iszero_15(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_15(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*15;
+    sp_digit* t3 = t + 4*15;
+    sp_digit* t4 = t + 6*15;
+    sp_digit* t5 = t + 8*15;
+    sp_digit* t6 = t + 10*15;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_15(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_15(t2, t1) &
+            sp_384_cmp_equal_15(t4, t3)) {
+        sp_384_proj_point_dbl_15(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_15(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_15(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_15(x, x, t5, p384_mod);
+        sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_15(t3, y, p384_mod);
+        sp_384_mont_sub_15(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_15(y, y, x, p384_mod);
+        sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_15(y, y, t5, p384_mod);
+        for (i = 0; i < 15; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 15; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 15; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_15_ctx {
@@ -28940,6 +29001,13 @@ typedef struct sp_384_proj_point_add_15_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_15_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -28971,251 +29039,148 @@ static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_15(ctx->t1, p384_mod, q->y);
-        sp_384_norm_15(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_15(p->x, q->x) & sp_384_cmp_equal_15(p->z, q->z) &
-            (sp_384_cmp_equal_15(p->y, q->y) | sp_384_cmp_equal_15(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_15(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_15_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_15(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_15(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_15(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_15(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_15(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_15(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_15(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_15(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_15(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_15(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_15(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_15(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_15(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_15(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_15(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_15(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_15(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_15(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_15(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_15(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_15(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_15(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_15(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_15(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_15(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_15(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_15(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_15(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_15(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_15(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_15(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_15(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 15; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 15; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 15; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
-        err = FP_WOULDBLOCK;
-    }
-    return err;
-}
-#endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_add_15(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*15;
-    sp_digit* t3 = t + 4*15;
-    sp_digit* t4 = t + 6*15;
-    sp_digit* t5 = t + 8*15;
-    sp_digit* t6 = t + 10*15;
-
-
-    /* Check double */
-    (void)sp_384_sub_15(t1, p384_mod, q->y);
-    sp_384_norm_15(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_15(p->x, q->x) & sp_384_cmp_equal_15(p->z, q->z) &
-        (sp_384_cmp_equal_15(p->y, q->y) | sp_384_cmp_equal_15(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_15(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_15(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_15(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_15(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_15(t2) & sp_384_iszero_15(t4) & maskt) {
-            sp_384_proj_point_dbl_15(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_15(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_15(x, x, t5, p384_mod);
-            sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_15(t3, y, p384_mod);
-            sp_384_mont_sub_15(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_15(y, y, x, p384_mod);
-            sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_15(y, y, t5, p384_mod);
-
-            for (i = 0; i < 15; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 15; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 15; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
+    if (err == MP_OKAY && ctx->state != 25) {
+        err = FP_WOULDBLOCK;
     }
+    return err;
 }
+#endif /* WOLFSSL_SP_NONBLOCK */
 
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -29709,7 +29674,6 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_15(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_15(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -29727,7 +29691,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
         sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_15(t2, b, p384_mod);
         sp_384_mont_sub_15(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_15(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_15(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -29757,7 +29721,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
     sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_15(t2, b, p384_mod);
     sp_384_mont_sub_15(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_15(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_15(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -29767,7 +29731,7 @@ static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_15(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_15(y, y, p384_mod);
 }
@@ -29824,7 +29788,7 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r,
         sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_15(t2, b, p384_mod);
         sp_384_mont_sub_15(x, x, t2, p384_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_15(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_15(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -29839,7 +29803,6 @@ static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_15(y, y, t1, p384_mod);
-
         /* Y = Y/2 */
         sp_384_div2_15(r[j].y, y, p384_mod);
         r[j].infinity = 0;
@@ -30260,8 +30223,8 @@ static int sp_384_ecc_mulmod_win_add_sub_15(sp_point_384* r, const sp_point_384*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_15(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*15;
@@ -30270,12 +30233,17 @@ static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*15;
     sp_digit* t6 = t + 10*15;
 
-    /* Check double */
-    (void)sp_384_sub_15(t1, p384_mod, q->y);
-    sp_384_norm_15(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_15(p->x, q->x) & sp_384_cmp_equal_15(p->z, q->z) &
-        (sp_384_cmp_equal_15(p->y, q->y) | sp_384_cmp_equal_15(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_15(p->x, t2) &
+            sp_384_cmp_equal_15(p->y, t4)) {
         sp_384_proj_point_dbl_15(r, p, t);
     }
     else {
@@ -30287,12 +30255,6 @@ static void sp_384_proj_point_add_qz1_15(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_15(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -33877,7 +33839,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_15(r);
 
-            if (sp_384_iszero_15(r) == 0) {
+            if (!sp_384_iszero_15(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 15, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -33886,7 +33848,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_15(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_15(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_15(s))) {
                     break;
                 }
             }
@@ -36178,7 +36140,7 @@ static void sp_521_map_21(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_21(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 21, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 21, 0, sizeof(sp_digit) * 21U);
     sp_521_mont_reduce_21(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_21(r->x, p521_mod);
@@ -36187,7 +36149,7 @@ static void sp_521_map_21(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_21(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 21, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 21, 0, sizeof(sp_digit) * 21U);
     sp_521_mont_reduce_21(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_21(r->y, p521_mod);
@@ -36196,7 +36158,6 @@ static void sp_521_map_21(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -36381,6 +36342,61 @@ static void sp_521_div2_21(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*21;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_21(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_21(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_21(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_21(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_21(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_21(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_21(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_21(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_21(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_21(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_21(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_21(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_21(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_21(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_21(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_21(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_21(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_21(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_21_ctx {
     int state;
@@ -36391,6 +36407,12 @@ typedef struct sp_521_proj_point_dbl_21_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_21_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -36515,62 +36537,6 @@ static int sp_521_proj_point_dbl_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_21(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*21;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_21(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_21(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_21(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_21(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_21(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_21(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_21(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_21(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_21(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_21(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_21(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_21(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_21(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_21(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_21(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_21(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_21(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_21(y, y, t2, p521_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -36602,6 +36568,7 @@ static int sp_521_iszero_21(const sp_digit* a)
             a[16] | a[17] | a[18] | a[19] | a[20]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -36609,6 +36576,81 @@ static int sp_521_iszero_21(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_21(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*21;
+    sp_digit* t3 = t + 4*21;
+    sp_digit* t4 = t + 6*21;
+    sp_digit* t5 = t + 8*21;
+    sp_digit* t6 = t + 10*21;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_21(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_21(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_21(t2, t1) &
+            sp_521_cmp_equal_21(t4, t3)) {
+        sp_521_proj_point_dbl_21(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_21(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_21(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_21(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_21(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_21(x, x, t5, p521_mod);
+        sp_521_mont_mul_21(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_21(t3, y, p521_mod);
+        sp_521_mont_sub_21(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_21(y, y, x, p521_mod);
+        sp_521_mont_mul_21(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_21(y, y, t5, p521_mod);
+        for (i = 0; i < 21; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 21; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 21; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_21_ctx {
@@ -36627,6 +36669,13 @@ typedef struct sp_521_proj_point_add_21_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_21_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -36658,252 +36707,149 @@ static int sp_521_proj_point_add_21_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_21(ctx->t1, p521_mod, q->y);
-        sp_521_norm_21(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_21(p->x, q->x) & sp_521_cmp_equal_21(p->z, q->z) &
-            (sp_521_cmp_equal_21(p->y, q->y) | sp_521_cmp_equal_21(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_21(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_21_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_21(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_21(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_21(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_21(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_21(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_21(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_21(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_21(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_21(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_21(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_21(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_21(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_21(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_21(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_21(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_21(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_21(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_21(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_21(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_21(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_21(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_21(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_21(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_21(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_21(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_21(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_21(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_21(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_21(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_21(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_21(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_21(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_21(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_21(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_21(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_21(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_21(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_21(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 21; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 21; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 21; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_21(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*21;
-    sp_digit* t3 = t + 4*21;
-    sp_digit* t4 = t + 6*21;
-    sp_digit* t5 = t + 8*21;
-    sp_digit* t6 = t + 10*21;
-
-
-    /* Check double */
-    (void)sp_521_sub_21(t1, p521_mod, q->y);
-    sp_521_norm_21(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_21(p->x, q->x) & sp_521_cmp_equal_21(p->z, q->z) &
-        (sp_521_cmp_equal_21(p->y, q->y) | sp_521_cmp_equal_21(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_21(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_21(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_21(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_21(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_21(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_21(t2) & sp_521_iszero_21(t4) & maskt) {
-            sp_521_proj_point_dbl_21(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_21(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_21(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_21(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_21(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_21(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_21(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_21(x, x, t5, p521_mod);
-            sp_521_mont_mul_21(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_21(t3, y, p521_mod);
-            sp_521_mont_sub_21(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_21(y, y, x, p521_mod);
-            sp_521_mont_mul_21(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_21(y, y, t5, p521_mod);
-
-            for (i = 0; i < 21; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 21; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 21; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
  * r  The resulting Montgomery form number.
@@ -37258,7 +37204,6 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_21(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_21(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -37276,7 +37221,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i,
         sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_21(t2, b, p521_mod);
         sp_521_mont_sub_21(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_21(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_21(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -37306,7 +37251,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i,
     sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_21(t2, b, p521_mod);
     sp_521_mont_sub_21(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_21(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_21(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -37316,7 +37261,7 @@ static void sp_521_proj_point_dbl_n_21(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_21(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_21(y, y, p521_mod);
 }
@@ -37373,7 +37318,7 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r,
         sp_521_mont_sqr_21(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_21(t2, b, p521_mod);
         sp_521_mont_sub_21(x, x, t2, p521_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_21(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_21(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -37388,7 +37333,6 @@ static void sp_521_proj_point_dbl_n_store_21(sp_point_521* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_521_mont_mul_21(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_21(y, y, t1, p521_mod);
-
         /* Y = Y/2 */
         sp_521_div2_21(r[j].y, y, p521_mod);
         r[j].infinity = 0;
@@ -37845,8 +37789,8 @@ static int sp_521_ecc_mulmod_win_add_sub_21(sp_point_521* r, const sp_point_521*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_21(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*21;
@@ -37855,12 +37799,17 @@ static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*21;
     sp_digit* t6 = t + 10*21;
 
-    /* Check double */
-    (void)sp_521_sub_21(t1, p521_mod, q->y);
-    sp_521_norm_21(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_21(p->x, q->x) & sp_521_cmp_equal_21(p->z, q->z) &
-        (sp_521_cmp_equal_21(p->y, q->y) | sp_521_cmp_equal_21(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_21(p->x, t2) &
+            sp_521_cmp_equal_21(p->y, t4)) {
         sp_521_proj_point_dbl_21(r, p, t);
     }
     else {
@@ -37872,12 +37821,6 @@ static void sp_521_proj_point_add_qz1_21(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_21(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_21(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_21(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_21(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -42032,7 +41975,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_21(r);
 
-            if (sp_521_iszero_21(r) == 0) {
+            if (!sp_521_iszero_21(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 21, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -42047,7 +41990,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_21(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_21(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_21(s))) {
                     break;
                 }
             }
@@ -45012,7 +44955,7 @@ static void sp_1024_map_42(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_42(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 42, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 42, 0, sizeof(sp_digit) * 42U);
     sp_1024_mont_reduce_42(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_42(r->x, p1024_mod);
@@ -45021,7 +44964,7 @@ static void sp_1024_map_42(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_42(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 42, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 42, 0, sizeof(sp_digit) * 42U);
     sp_1024_mont_reduce_42(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_42(r->y, p1024_mod);
@@ -45030,7 +44973,6 @@ static void sp_1024_map_42(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -45184,6 +45126,61 @@ static void sp_1024_div2_42(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*42;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_42(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_42(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_42(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_42(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_42(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_42(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_42(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_42(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_42(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_42(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_42(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_42(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_42(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_42(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_42(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_42(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_42(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_42(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_42_ctx {
     int state;
@@ -45194,6 +45191,12 @@ typedef struct sp_1024_proj_point_dbl_42_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_42_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -45283,97 +45286,41 @@ static int sp_1024_proj_point_dbl_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 14:
         /* X = X - Y */
-        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->y, p1024_mod);
-        ctx->state = 15;
-        break;
-    case 15:
-        /* X = X - Y */
-        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->y, p1024_mod);
-        ctx->state = 16;
-        break;
-    case 16:
-        /* Y = Y - X */
-        sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 17;
-        break;
-    case 17:
-        /* Y = Y * T1 */
-        sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t1, p1024_mod, p1024_mp_mod);
-        ctx->state = 18;
-        break;
-    case 18:
-        /* Y = Y - T2 */
-        sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->t2, p1024_mod);
-        ctx->state = 19;
-        /* fall-through */
-    case 19:
-        err = MP_OKAY;
-        break;
-    }
-
-    if (err == MP_OKAY && ctx->state != 19) {
-        err = FP_WOULDBLOCK;
-    }
-
-    return err;
-}
-#endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_42(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*42;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_42(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_42(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_42(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_42(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_42(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_42(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_42(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_42(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_42(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_42(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_42(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_42(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_42(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_42(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_42(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_42(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_42(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_42(y, y, t2, p1024_mod);
-}
+        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->y, p1024_mod);
+        ctx->state = 15;
+        break;
+    case 15:
+        /* X = X - Y */
+        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->y, p1024_mod);
+        ctx->state = 16;
+        break;
+    case 16:
+        /* Y = Y - X */
+        sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod);
+        ctx->state = 17;
+        break;
+    case 17:
+        /* Y = Y * T1 */
+        sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t1, p1024_mod, p1024_mp_mod);
+        ctx->state = 18;
+        break;
+    case 18:
+        /* Y = Y - T2 */
+        sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->t2, p1024_mod);
+        ctx->state = 19;
+        /* fall-through */
+    case 19:
+        err = MP_OKAY;
+        break;
+    }
+
+    if (err == MP_OKAY && ctx->state != 19) {
+        err = FP_WOULDBLOCK;
+    }
 
+    return err;
+}
+#endif /* WOLFSSL_SP_NONBLOCK */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -45415,6 +45362,7 @@ static int sp_1024_iszero_42(const sp_digit* a)
             a[40] | a[41]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -45422,6 +45370,81 @@ static int sp_1024_iszero_42(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_42(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*42;
+    sp_digit* t3 = t + 4*42;
+    sp_digit* t4 = t + 6*42;
+    sp_digit* t5 = t + 8*42;
+    sp_digit* t6 = t + 10*42;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_42(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_42(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_42(t2, t1) &
+            sp_1024_cmp_equal_42(t4, t3)) {
+        sp_1024_proj_point_dbl_42(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_42(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_42(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_42(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_42(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_42(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_42(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_42(t3, y, p1024_mod);
+        sp_1024_mont_sub_42(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_42(y, y, x, p1024_mod);
+        sp_1024_mont_mul_42(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_42(y, y, t5, p1024_mod);
+        for (i = 0; i < 42; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 42; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 42; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_42_ctx {
@@ -45440,6 +45463,13 @@ typedef struct sp_1024_proj_point_add_42_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_42_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -45471,252 +45501,149 @@ static int sp_1024_proj_point_add_42_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_42(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_42(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_42(p->x, q->x) & sp_1024_cmp_equal_42(p->z, q->z) &
-            (sp_1024_cmp_equal_42(p->y, q->y) | sp_1024_cmp_equal_42(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_42(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_42_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_42(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_42(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_42(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_42(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_42(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_42(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_42(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_42(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_42(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_42(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_42(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_42(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_42(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_42(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_42(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_42(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_42(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_42(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_42(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_42(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_42(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_42(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_42(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_42(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_42(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_42(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_42(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_42(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_42(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_42(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_42(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_42(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_42(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_42(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_42(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_42(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 42; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 42; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 42; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_42(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*42;
-    sp_digit* t3 = t + 4*42;
-    sp_digit* t4 = t + 6*42;
-    sp_digit* t5 = t + 8*42;
-    sp_digit* t6 = t + 10*42;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_42(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_42(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_42(p->x, q->x) & sp_1024_cmp_equal_42(p->z, q->z) &
-        (sp_1024_cmp_equal_42(p->y, q->y) | sp_1024_cmp_equal_42(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_42(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_42(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_42(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_42(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_42(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_42(t2) & sp_1024_iszero_42(t4) & maskt) {
-            sp_1024_proj_point_dbl_42(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_42(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_42(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_42(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_42(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_42(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_42(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_42(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_42(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_42(t3, y, p1024_mod);
-            sp_1024_mont_sub_42(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_42(y, y, x, p1024_mod);
-            sp_1024_mont_mul_42(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_42(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 42; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 42; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 42; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifdef WOLFSSL_SP_SMALL
 /* Multiply the point by the scalar and return the result.
  * If map is true then convert result to affine coordinates.
@@ -46095,7 +46022,6 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_42(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_42(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -46113,7 +46039,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i,
         sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_42(t2, b, p1024_mod);
         sp_1024_mont_sub_42(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_42(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -46143,7 +46069,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i,
     sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_42(t2, b, p1024_mod);
     sp_1024_mont_sub_42(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_42(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -46153,7 +46079,7 @@ static void sp_1024_proj_point_dbl_n_42(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_42(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_42(y, y, p1024_mod);
 }
@@ -46210,7 +46136,7 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r,
         sp_1024_mont_sqr_42(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_42(t2, b, p1024_mod);
         sp_1024_mont_sub_42(x, x, t2, p1024_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_42(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_42(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -46225,7 +46151,6 @@ static void sp_1024_proj_point_dbl_n_store_42(sp_point_1024* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_1024_mont_mul_42(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_42(y, y, t1, p1024_mod);
-
         /* Y = Y/2 */
         sp_1024_div2_42(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
@@ -46550,8 +46475,8 @@ static int sp_1024_ecc_mulmod_win_add_sub_42(sp_point_1024* r, const sp_point_10
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*42;
@@ -46560,12 +46485,17 @@ static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*42;
     sp_digit* t6 = t + 10*42;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_42(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_42(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_42(p->x, q->x) & sp_1024_cmp_equal_42(p->z, q->z) &
-        (sp_1024_cmp_equal_42(p->y, q->y) | sp_1024_cmp_equal_42(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_42(p->x, t2) &
+            sp_1024_cmp_equal_42(p->y, t4)) {
         sp_1024_proj_point_dbl_42(r, p, t);
     }
     else {
@@ -46577,12 +46507,6 @@ static void sp_1024_proj_point_add_qz1_42(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_42(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_42(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_42(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_42(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_c64.c b/wolfcrypt/src/sp_c64.c
index 21f2653952..47a494a99c 100644
--- a/wolfcrypt/src/sp_c64.c
+++ b/wolfcrypt/src/sp_c64.c
@@ -22056,7 +22056,7 @@ static void sp_256_map_5(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_5(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 5, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 5, 0, sizeof(sp_digit) * 5U);
     sp_256_mont_reduce_5(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_5(r->x, p256_mod);
@@ -22065,7 +22065,7 @@ static void sp_256_map_5(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_5(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 5, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 5, 0, sizeof(sp_digit) * 5U);
     sp_256_mont_reduce_5(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_5(r->y, p256_mod);
@@ -22074,7 +22074,6 @@ static void sp_256_map_5(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -22231,6 +22230,61 @@ static void sp_256_div2_5(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*5;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_5(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_5(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_5(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_5(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_5(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_5(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_5(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_5(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_5(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_5(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_5(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_5(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_5(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_5(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_5(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_5(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_5(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_5(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_5_ctx {
     int state;
@@ -22241,6 +22295,12 @@ typedef struct sp_256_proj_point_dbl_5_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_5_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -22365,62 +22425,6 @@ static int sp_256_proj_point_dbl_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_5(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*5;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_5(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_5(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_5(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_5(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_5(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_5(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_5(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_5(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_5(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_5(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_5(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_5(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_5(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_5(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_5(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_5(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_5(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_5(y, y, t2, p256_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -22445,6 +22449,7 @@ static int sp_256_iszero_5(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -22452,6 +22457,81 @@ static int sp_256_iszero_5(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_5(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*5;
+    sp_digit* t3 = t + 4*5;
+    sp_digit* t4 = t + 6*5;
+    sp_digit* t5 = t + 8*5;
+    sp_digit* t6 = t + 10*5;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_5(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_5(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_5(t2, t1) &
+            sp_256_cmp_equal_5(t4, t3)) {
+        sp_256_proj_point_dbl_5(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_5(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_5(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_5(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_5(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_5(x, x, t5, p256_mod);
+        sp_256_mont_mul_5(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_5(t3, y, p256_mod);
+        sp_256_mont_sub_5(x, x, t3, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_5(y, y, x, p256_mod);
+        sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_5(y, y, t5, p256_mod);
+        for (i = 0; i < 5; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 5; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 5; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_5_ctx {
@@ -22470,6 +22550,13 @@ typedef struct sp_256_proj_point_add_5_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_5_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -22501,252 +22588,149 @@ static int sp_256_proj_point_add_5_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_5(ctx->t1, p256_mod, q->y);
-        sp_256_norm_5(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_5(p->x, q->x) & sp_256_cmp_equal_5(p->z, q->z) &
-            (sp_256_cmp_equal_5(p->y, q->y) | sp_256_cmp_equal_5(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_5(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_5_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_5(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_5(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_5(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_5(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_5(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_5(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_5(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_5(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_5(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_5(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_5(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_5(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_5(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_5(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_5(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_5(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_5(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_5(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_5(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_5(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_5(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_5(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_5(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_5(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_5(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_5(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_5(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_5(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_5(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_dbl_5(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t3, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_5(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_5(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_5(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_5(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_5(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_5(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_5(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_5(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 5; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 5; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 5; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_5(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*5;
-    sp_digit* t3 = t + 4*5;
-    sp_digit* t4 = t + 6*5;
-    sp_digit* t5 = t + 8*5;
-    sp_digit* t6 = t + 10*5;
-
-
-    /* Check double */
-    (void)sp_256_sub_5(t1, p256_mod, q->y);
-    sp_256_norm_5(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_5(p->x, q->x) & sp_256_cmp_equal_5(p->z, q->z) &
-        (sp_256_cmp_equal_5(p->y, q->y) | sp_256_cmp_equal_5(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_5(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_5(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_5(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_5(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_5(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_5(t2) & sp_256_iszero_5(t4) & maskt) {
-            sp_256_proj_point_dbl_5(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_5(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_5(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_5(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_5(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_5(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_5(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_5(x, x, t5, p256_mod);
-            sp_256_mont_mul_5(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_dbl_5(t3, y, p256_mod);
-            sp_256_mont_sub_5(x, x, t3, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_5(y, y, x, p256_mod);
-            sp_256_mont_mul_5(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_5(y, y, t5, p256_mod);
-
-            for (i = 0; i < 5; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 5; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 5; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
  * r  The resulting Montgomery form number.
@@ -23160,7 +23144,6 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_5(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_5(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -23178,7 +23161,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i,
         sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_5(t2, b, p256_mod);
         sp_256_mont_sub_5(x, x, t2, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_5(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_5(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -23208,7 +23191,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i,
     sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_dbl_5(t2, b, p256_mod);
     sp_256_mont_sub_5(x, x, t2, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_sub_lower_5(t2, b, x, p256_mod);
     sp_256_mont_dbl_lower_5(b, t2, p256_mod);
     /* Z = Z*Y */
@@ -23218,7 +23201,7 @@ static void sp_256_proj_point_dbl_n_5(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_5(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_5(y, y, p256_mod);
 }
@@ -23275,7 +23258,7 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r,
         sp_256_mont_sqr_5(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_5(t2, b, p256_mod);
         sp_256_mont_sub_5(x, x, t2, p256_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_5(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_5(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -23290,7 +23273,6 @@ static void sp_256_proj_point_dbl_n_store_5(sp_point_256* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_256_mont_mul_5(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_5(y, y, t1, p256_mod);
-
         /* Y = Y/2 */
         sp_256_div2_5(r[j].y, y, p256_mod);
         r[j].infinity = 0;
@@ -23651,8 +23633,8 @@ static int sp_256_ecc_mulmod_win_add_sub_5(sp_point_256* r, const sp_point_256*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_5(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*5;
@@ -23661,12 +23643,17 @@ static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*5;
     sp_digit* t6 = t + 10*5;
 
-    /* Check double */
-    (void)sp_256_sub_5(t1, p256_mod, q->y);
-    sp_256_norm_5(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_5(p->x, q->x) & sp_256_cmp_equal_5(p->z, q->z) &
-        (sp_256_cmp_equal_5(p->y, q->y) | sp_256_cmp_equal_5(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_5(p->x, t2) &
+            sp_256_cmp_equal_5(p->y, t4)) {
         sp_256_proj_point_dbl_5(r, p, t);
     }
     else {
@@ -23678,12 +23665,6 @@ static void sp_256_proj_point_add_qz1_5(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_5(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_5(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_5(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_5(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -26689,7 +26670,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_5(r);
 
-            if (sp_256_iszero_5(r) == 0) {
+            if (!sp_256_iszero_5(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 5, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -26698,7 +26679,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_5(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_5(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_5(s))) {
                     break;
                 }
             }
@@ -28978,7 +28959,7 @@ static void sp_384_map_7(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_7(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 7, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 7, 0, sizeof(sp_digit) * 7U);
     sp_384_mont_reduce_7(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_7(r->x, p384_mod);
@@ -28987,7 +28968,7 @@ static void sp_384_map_7(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_7(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 7, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 7, 0, sizeof(sp_digit) * 7U);
     sp_384_mont_reduce_7(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_7(r->y, p384_mod);
@@ -28996,7 +28977,6 @@ static void sp_384_map_7(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -29157,6 +29137,61 @@ static void sp_384_div2_7(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*7;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_7(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_7(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_7(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_7(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_7(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_7(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_7(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_7(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_7(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_7(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_7(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_7(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_7(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_7(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_7(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_7(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_7(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_7(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_7_ctx {
     int state;
@@ -29167,6 +29202,12 @@ typedef struct sp_384_proj_point_dbl_7_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_7_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -29291,62 +29332,6 @@ static int sp_384_proj_point_dbl_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_7(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*7;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_7(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_7(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_7(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_7(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_7(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_7(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_7(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_7(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_7(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_7(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_7(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_7(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_7(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_7(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_7(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_7(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_7(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_7(y, y, t2, p384_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -29372,6 +29357,7 @@ static int sp_384_iszero_7(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -29379,6 +29365,81 @@ static int sp_384_iszero_7(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_7(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*7;
+    sp_digit* t3 = t + 4*7;
+    sp_digit* t4 = t + 6*7;
+    sp_digit* t5 = t + 8*7;
+    sp_digit* t6 = t + 10*7;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_7(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_7(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_7(t2, t1) &
+            sp_384_cmp_equal_7(t4, t3)) {
+        sp_384_proj_point_dbl_7(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_7(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_7(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_7(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_7(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_7(x, x, t5, p384_mod);
+        sp_384_mont_mul_7(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_7(t3, y, p384_mod);
+        sp_384_mont_sub_7(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_7(y, y, x, p384_mod);
+        sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_7(y, y, t5, p384_mod);
+        for (i = 0; i < 7; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 7; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 7; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_7_ctx {
@@ -29397,6 +29458,13 @@ typedef struct sp_384_proj_point_add_7_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_7_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -29428,251 +29496,148 @@ static int sp_384_proj_point_add_7_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_7(ctx->t1, p384_mod, q->y);
-        sp_384_norm_7(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_7(p->x, q->x) & sp_384_cmp_equal_7(p->z, q->z) &
-            (sp_384_cmp_equal_7(p->y, q->y) | sp_384_cmp_equal_7(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_7(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_7_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_7(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_7(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_7(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_7(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_7(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_7(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_7(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_7(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_7(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_7(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_7(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_7(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_7(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_7(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_7(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_7(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_7(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_7(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_7(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_7(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_7(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_7(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_7(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_7(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_7(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_7(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_7(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_7(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_7(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_7(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_7(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_7(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_7(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_7(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_7(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_7(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_7(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_7(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 7; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 7; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 7; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
-        err = FP_WOULDBLOCK;
-    }
-    return err;
-}
-#endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_add_7(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*7;
-    sp_digit* t3 = t + 4*7;
-    sp_digit* t4 = t + 6*7;
-    sp_digit* t5 = t + 8*7;
-    sp_digit* t6 = t + 10*7;
-
-
-    /* Check double */
-    (void)sp_384_sub_7(t1, p384_mod, q->y);
-    sp_384_norm_7(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_7(p->x, q->x) & sp_384_cmp_equal_7(p->z, q->z) &
-        (sp_384_cmp_equal_7(p->y, q->y) | sp_384_cmp_equal_7(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_7(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_7(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_7(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_7(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_7(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_7(t2) & sp_384_iszero_7(t4) & maskt) {
-            sp_384_proj_point_dbl_7(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_7(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_7(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_7(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_7(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_7(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_7(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_7(x, x, t5, p384_mod);
-            sp_384_mont_mul_7(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_7(t3, y, p384_mod);
-            sp_384_mont_sub_7(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_7(y, y, x, p384_mod);
-            sp_384_mont_mul_7(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_7(y, y, t5, p384_mod);
-
-            for (i = 0; i < 7; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 7; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 7; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
+    if (err == MP_OKAY && ctx->state != 25) {
+        err = FP_WOULDBLOCK;
     }
+    return err;
 }
+#endif /* WOLFSSL_SP_NONBLOCK */
 
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
@@ -30123,7 +30088,6 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_7(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_7(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -30141,7 +30105,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i,
         sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_7(t2, b, p384_mod);
         sp_384_mont_sub_7(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_7(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_7(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -30171,7 +30135,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i,
     sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_7(t2, b, p384_mod);
     sp_384_mont_sub_7(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_7(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_7(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -30181,7 +30145,7 @@ static void sp_384_proj_point_dbl_n_7(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_7(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_7(y, y, p384_mod);
 }
@@ -30238,7 +30202,7 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r,
         sp_384_mont_sqr_7(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_7(t2, b, p384_mod);
         sp_384_mont_sub_7(x, x, t2, p384_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_7(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_7(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -30253,7 +30217,6 @@ static void sp_384_proj_point_dbl_n_store_7(sp_point_384* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_384_mont_mul_7(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_7(y, y, t1, p384_mod);
-
         /* Y = Y/2 */
         sp_384_div2_7(r[j].y, y, p384_mod);
         r[j].infinity = 0;
@@ -30626,8 +30589,8 @@ static int sp_384_ecc_mulmod_win_add_sub_7(sp_point_384* r, const sp_point_384*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_7(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*7;
@@ -30636,12 +30599,17 @@ static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*7;
     sp_digit* t6 = t + 10*7;
 
-    /* Check double */
-    (void)sp_384_sub_7(t1, p384_mod, q->y);
-    sp_384_norm_7(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_7(p->x, q->x) & sp_384_cmp_equal_7(p->z, q->z) &
-        (sp_384_cmp_equal_7(p->y, q->y) | sp_384_cmp_equal_7(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_7(p->x, t2) &
+            sp_384_cmp_equal_7(p->y, t4)) {
         sp_384_proj_point_dbl_7(r, p, t);
     }
     else {
@@ -30653,12 +30621,6 @@ static void sp_384_proj_point_add_qz1_7(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_7(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_7(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_7(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_7(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -34161,7 +34123,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_7(r);
 
-            if (sp_384_iszero_7(r) == 0) {
+            if (!sp_384_iszero_7(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 7, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -34170,7 +34132,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_7(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_7(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_7(s))) {
                     break;
                 }
             }
@@ -36519,7 +36481,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_9(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_9(r->x, p521_mod);
@@ -36528,7 +36490,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_9(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_9(r->y, p521_mod);
@@ -36537,7 +36499,6 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -36702,6 +36663,61 @@ static void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_9(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_9(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_9(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_9(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_9(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_9_ctx {
     int state;
@@ -36712,6 +36728,12 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_9_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -36836,62 +36858,6 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_9(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_9(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_9(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_9(y, y, t2, p521_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -36918,6 +36884,7 @@ static int sp_521_iszero_9(const sp_digit* a)
             a[8]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -36925,6 +36892,81 @@ static int sp_521_iszero_9(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_9(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* t3 = t + 4*9;
+    sp_digit* t4 = t + 6*9;
+    sp_digit* t5 = t + 8*9;
+    sp_digit* t6 = t + 10*9;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(t2, t1) &
+            sp_521_cmp_equal_9(t4, t3)) {
+        sp_521_proj_point_dbl_9(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(x, x, t5, p521_mod);
+        sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_9(t3, y, p521_mod);
+        sp_521_mont_sub_9(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+        sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(y, y, t5, p521_mod);
+        for (i = 0; i < 9; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_9_ctx {
@@ -36943,6 +36985,13 @@ typedef struct sp_521_proj_point_add_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_9_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -36974,252 +37023,149 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_9(ctx->t1, p521_mod, q->y);
-        sp_521_norm_9(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-            (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_9(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_9(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_9(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 9; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_9(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* t3 = t + 4*9;
-    sp_digit* t4 = t + 6*9;
-    sp_digit* t5 = t + 8*9;
-    sp_digit* t6 = t + 10*9;
-
-
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_9(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_9(t2) & sp_521_iszero_9(t4) & maskt) {
-            sp_521_proj_point_dbl_9(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(x, x, t5, p521_mod);
-            sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_9(t3, y, p521_mod);
-            sp_521_mont_sub_9(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-            sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(y, y, t5, p521_mod);
-
-            for (i = 0; i < 9; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
  *
  * r  The resulting Montgomery form number.
@@ -37550,7 +37496,6 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_9(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_9(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -37568,7 +37513,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -37598,7 +37543,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_9(t2, b, p521_mod);
     sp_521_mont_sub_9(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_9(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -37608,7 +37553,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_9(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_9(y, y, p521_mod);
 }
@@ -37665,7 +37610,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -37680,7 +37625,6 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
-
         /* Y = Y/2 */
         sp_521_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
@@ -38065,8 +38009,8 @@ static int sp_521_ecc_mulmod_win_add_sub_9(sp_point_521* r, const sp_point_521*
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_9(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*9;
@@ -38075,12 +38019,17 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*9;
     sp_digit* t6 = t + 10*9;
 
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(p->x, t2) &
+            sp_521_cmp_equal_9(p->y, t4)) {
         sp_521_proj_point_dbl_9(r, p, t);
     }
     else {
@@ -38092,12 +38041,6 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_9(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -41630,7 +41573,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_9(r);
 
-            if (sp_521_iszero_9(r) == 0) {
+            if (!sp_521_iszero_9(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 9, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -41645,7 +41588,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_9(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_9(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_9(s))) {
                     break;
                 }
             }
@@ -44449,7 +44392,7 @@ static void sp_1024_map_18(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_18(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 18, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 18, 0, sizeof(sp_digit) * 18U);
     sp_1024_mont_reduce_18(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_18(r->x, p1024_mod);
@@ -44458,7 +44401,7 @@ static void sp_1024_map_18(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_18(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 18, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 18, 0, sizeof(sp_digit) * 18U);
     sp_1024_mont_reduce_18(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_18(r->y, p1024_mod);
@@ -44467,7 +44410,6 @@ static void sp_1024_map_18(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -44597,6 +44539,61 @@ static void sp_1024_div2_18(sp_digit* r, const sp_digit* a, const sp_digit* m)
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*18;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_18(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_18(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_18(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_18(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_18(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_18(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_18(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_18(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_18(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_18(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_18(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_18(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_18(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_18(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_18(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_18(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_18(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_18(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_18_ctx {
     int state;
@@ -44607,6 +44604,12 @@ typedef struct sp_1024_proj_point_dbl_18_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_18_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -44696,97 +44699,41 @@ static int sp_1024_proj_point_dbl_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         break;
     case 14:
         /* X = X - Y */
-        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->y, p1024_mod);
-        ctx->state = 15;
-        break;
-    case 15:
-        /* X = X - Y */
-        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->y, p1024_mod);
-        ctx->state = 16;
-        break;
-    case 16:
-        /* Y = Y - X */
-        sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 17;
-        break;
-    case 17:
-        /* Y = Y * T1 */
-        sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t1, p1024_mod, p1024_mp_mod);
-        ctx->state = 18;
-        break;
-    case 18:
-        /* Y = Y - T2 */
-        sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->t2, p1024_mod);
-        ctx->state = 19;
-        /* fall-through */
-    case 19:
-        err = MP_OKAY;
-        break;
-    }
-
-    if (err == MP_OKAY && ctx->state != 19) {
-        err = FP_WOULDBLOCK;
-    }
-
-    return err;
-}
-#endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_18(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*18;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_18(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_18(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_18(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_18(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_18(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_18(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_18(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_18(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_18(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_18(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_18(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_18(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_18(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_18(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_18(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_18(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_18(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_18(y, y, t2, p1024_mod);
-}
+        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->y, p1024_mod);
+        ctx->state = 15;
+        break;
+    case 15:
+        /* X = X - Y */
+        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->y, p1024_mod);
+        ctx->state = 16;
+        break;
+    case 16:
+        /* Y = Y - X */
+        sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod);
+        ctx->state = 17;
+        break;
+    case 17:
+        /* Y = Y * T1 */
+        sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t1, p1024_mod, p1024_mp_mod);
+        ctx->state = 18;
+        break;
+    case 18:
+        /* Y = Y - T2 */
+        sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->t2, p1024_mod);
+        ctx->state = 19;
+        /* fall-through */
+    case 19:
+        err = MP_OKAY;
+        break;
+    }
+
+    if (err == MP_OKAY && ctx->state != 19) {
+        err = FP_WOULDBLOCK;
+    }
 
+    return err;
+}
+#endif /* WOLFSSL_SP_NONBLOCK */
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -44817,6 +44764,7 @@ static int sp_1024_iszero_18(const sp_digit* a)
             a[16] | a[17]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -44824,6 +44772,81 @@ static int sp_1024_iszero_18(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_18(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*18;
+    sp_digit* t3 = t + 4*18;
+    sp_digit* t4 = t + 6*18;
+    sp_digit* t5 = t + 8*18;
+    sp_digit* t6 = t + 10*18;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_18(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_18(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_18(t2, t1) &
+            sp_1024_cmp_equal_18(t4, t3)) {
+        sp_1024_proj_point_dbl_18(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_18(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_18(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_18(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_18(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_18(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_18(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_18(t3, y, p1024_mod);
+        sp_1024_mont_sub_18(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_18(y, y, x, p1024_mod);
+        sp_1024_mont_mul_18(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_18(y, y, t5, p1024_mod);
+        for (i = 0; i < 18; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 18; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 18; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_18_ctx {
@@ -44842,6 +44865,13 @@ typedef struct sp_1024_proj_point_add_18_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_18_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -44873,252 +44903,149 @@ static int sp_1024_proj_point_add_18_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_18(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_18(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_18(p->x, q->x) & sp_1024_cmp_equal_18(p->z, q->z) &
-            (sp_1024_cmp_equal_18(p->y, q->y) | sp_1024_cmp_equal_18(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_18(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_18_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_18(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_18(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_18(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_18(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_18(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_18(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_18(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_18(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_18(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_18(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_18(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_18(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_18(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_18(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_18(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_18(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_18(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_18(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_18(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_18(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_18(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_18(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_18(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_18(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_18(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_18(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_18(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_18(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_18(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_18(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_18(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_18(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_18(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_18(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_18(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_18(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 18; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 18; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 18; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_18(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*18;
-    sp_digit* t3 = t + 4*18;
-    sp_digit* t4 = t + 6*18;
-    sp_digit* t5 = t + 8*18;
-    sp_digit* t6 = t + 10*18;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_18(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_18(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_18(p->x, q->x) & sp_1024_cmp_equal_18(p->z, q->z) &
-        (sp_1024_cmp_equal_18(p->y, q->y) | sp_1024_cmp_equal_18(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_18(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_18(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_18(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_18(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_18(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_18(t2) & sp_1024_iszero_18(t4) & maskt) {
-            sp_1024_proj_point_dbl_18(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_18(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_18(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_18(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_18(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_18(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_18(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_18(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_18(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_18(t3, y, p1024_mod);
-            sp_1024_mont_sub_18(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_18(y, y, x, p1024_mod);
-            sp_1024_mont_mul_18(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_18(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 18; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 18; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 18; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifdef WOLFSSL_SP_SMALL
 /* Multiply the point by the scalar and return the result.
  * If map is true then convert result to affine coordinates.
@@ -45449,7 +45376,6 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_18(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_18(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -45467,7 +45393,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i,
         sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_18(t2, b, p1024_mod);
         sp_1024_mont_sub_18(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_18(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -45497,7 +45423,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i,
     sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_18(t2, b, p1024_mod);
     sp_1024_mont_sub_18(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_18(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -45507,7 +45433,7 @@ static void sp_1024_proj_point_dbl_n_18(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_18(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_18(y, y, p1024_mod);
 }
@@ -45564,7 +45490,7 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r,
         sp_1024_mont_sqr_18(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_18(t2, b, p1024_mod);
         sp_1024_mont_sub_18(x, x, t2, p1024_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_18(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_18(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -45579,7 +45505,6 @@ static void sp_1024_proj_point_dbl_n_store_18(sp_point_1024* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_1024_mont_mul_18(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_18(y, y, t1, p1024_mod);
-
         /* Y = Y/2 */
         sp_1024_div2_18(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
@@ -45904,8 +45829,8 @@ static int sp_1024_ecc_mulmod_win_add_sub_18(sp_point_1024* r, const sp_point_10
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*18;
@@ -45914,12 +45839,17 @@ static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*18;
     sp_digit* t6 = t + 10*18;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_18(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_18(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_18(p->x, q->x) & sp_1024_cmp_equal_18(p->z, q->z) &
-        (sp_1024_cmp_equal_18(p->y, q->y) | sp_1024_cmp_equal_18(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_18(p->x, t2) &
+            sp_1024_cmp_equal_18(p->y, t4)) {
         sp_1024_proj_point_dbl_18(r, p, t);
     }
     else {
@@ -45931,12 +45861,6 @@ static void sp_1024_proj_point_add_qz1_18(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_18(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_18(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_18(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_18(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c
index 99c6cc6411..f5539226d4 100644
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -18802,7 +18802,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_8(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 8, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_8(r->x, p256_mod);
@@ -18811,7 +18811,7 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_8(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 8, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 8, 0, sizeof(sp_digit) * 8U);
     sp_256_mont_reduce_8(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_8(r->y, p256_mod);
@@ -18820,7 +18820,6 @@ static void sp_256_map_8(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -19196,6 +19195,61 @@ SP_NOINLINE static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_d
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_8(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_8(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_8(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_8(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_8(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_8(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_8_ctx {
     int state;
@@ -19206,6 +19260,12 @@ typedef struct sp_256_proj_point_dbl_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_8_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -19330,62 +19390,6 @@ static int sp_256_proj_point_dbl_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_8(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_8(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_8(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_8(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_8(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_8(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_8(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_8(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_8(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_8(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_8(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_8(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_8(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_8(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_8(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_8(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_8(y, y, t2, p256_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -19411,6 +19415,7 @@ static int sp_256_iszero_8(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5] | a[6] | a[7]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -19418,6 +19423,81 @@ static int sp_256_iszero_8(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_8(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*8;
+    sp_digit* t3 = t + 4*8;
+    sp_digit* t4 = t + 6*8;
+    sp_digit* t5 = t + 8*8;
+    sp_digit* t6 = t + 10*8;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(t2, t1) &
+            sp_256_cmp_equal_8(t4, t3)) {
+        sp_256_proj_point_dbl_8(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(x, x, t5, p256_mod);
+        sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_dbl_8(t3, y, p256_mod);
+        sp_256_mont_sub_8(x, x, t3, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(y, y, x, p256_mod);
+        sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(y, y, t5, p256_mod);
+        for (i = 0; i < 8; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 8; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_8_ctx {
@@ -19436,6 +19516,13 @@ typedef struct sp_256_proj_point_add_8_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_8_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -19467,252 +19554,149 @@ static int sp_256_proj_point_add_8_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_8(ctx->t1, p256_mod, q->y);
-        sp_256_norm_8(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-            (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_8_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_8(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_8(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_8(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_8(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_8(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_8(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_8(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_8(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_8(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_8(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_8(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_8(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_8(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_8(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_8(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_8(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_8(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_8(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 8; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 8; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_8(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*8;
-    sp_digit* t3 = t + 4*8;
-    sp_digit* t4 = t + 6*8;
-    sp_digit* t5 = t + 8*8;
-    sp_digit* t6 = t + 10*8;
-
-
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_8(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_8(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_8(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_8(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_8(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_8(t2) & sp_256_iszero_8(t4) & maskt) {
-            sp_256_proj_point_dbl_8(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_8(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_8(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_8(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_8(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(x, x, t5, p256_mod);
-            sp_256_mont_mul_8(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_dbl_8(t3, y, p256_mod);
-            sp_256_mont_sub_8(x, x, t3, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_8(y, y, x, p256_mod);
-            sp_256_mont_mul_8(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_8(y, y, t5, p256_mod);
-
-            for (i = 0; i < 8; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 8; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -20000,7 +19984,6 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_8(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_8(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -20018,7 +20001,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
         sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_dbl_8(t2, b, p256_mod);
         sp_256_mont_sub_8(x, x, t2, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
         sp_256_mont_dbl_lower_8(b, t2, p256_mod);
         /* Z = Z*Y */
@@ -20048,7 +20031,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     sp_256_mont_sqr_8(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_dbl_8(t2, b, p256_mod);
     sp_256_mont_sub_8(x, x, t2, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_sub_lower_8(t2, b, x, p256_mod);
     sp_256_mont_dbl_lower_8(b, t2, p256_mod);
     /* Z = Z*Y */
@@ -20058,7 +20041,7 @@ static void sp_256_proj_point_dbl_n_8(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_8(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_8(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_8(y, y, p256_mod);
 }
@@ -20103,8 +20086,8 @@ typedef struct sp_table_entry_256 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_8(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*8;
@@ -20113,12 +20096,17 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*8;
     sp_digit* t6 = t + 10*8;
 
-    /* Check double */
-    (void)sp_256_sub_8(t1, p256_mod, q->y);
-    sp_256_norm_8(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_8(p->x, q->x) & sp_256_cmp_equal_8(p->z, q->z) &
-        (sp_256_cmp_equal_8(p->y, q->y) | sp_256_cmp_equal_8(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_8(p->x, t2) &
+            sp_256_cmp_equal_8(p->y, t4)) {
         sp_256_proj_point_dbl_8(r, p, t);
     }
     else {
@@ -20130,12 +20118,6 @@ static void sp_256_proj_point_add_qz1_8(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_8(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_8(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_8(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_8(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -23710,7 +23692,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_8(r);
 
-            if (sp_256_iszero_8(r) == 0) {
+            if (!sp_256_iszero_8(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 8, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -23719,7 +23701,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_8(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_8(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_8(s))) {
                     break;
                 }
             }
@@ -26268,7 +26250,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_12(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 12, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_12(r->x, p384_mod);
@@ -26277,7 +26259,7 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_12(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 12, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 12, 0, sizeof(sp_digit) * 12U);
     sp_384_mont_reduce_12(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_12(r->y, p384_mod);
@@ -26286,7 +26268,6 @@ static void sp_384_map_12(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -26470,6 +26451,61 @@ SP_NOINLINE static void sp_384_div2_12(sp_digit* r, const sp_digit* a, const sp_
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_12(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_12(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_12(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_12(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_12(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_12(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_12_ctx {
     int state;
@@ -26480,6 +26516,12 @@ typedef struct sp_384_proj_point_dbl_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_12_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -26604,62 +26646,6 @@ static int sp_384_proj_point_dbl_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_12(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_12(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_12(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_12(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_12(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_12(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_12(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_12(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_12(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_12(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_12(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_12(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_12(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_12(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_12(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_12(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_12(y, y, t2, p384_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -26687,6 +26673,7 @@ static int sp_384_iszero_12(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -26694,6 +26681,81 @@ static int sp_384_iszero_12(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_12(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*12;
+    sp_digit* t3 = t + 4*12;
+    sp_digit* t4 = t + 6*12;
+    sp_digit* t5 = t + 8*12;
+    sp_digit* t6 = t + 10*12;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(t2, t1) &
+            sp_384_cmp_equal_12(t4, t3)) {
+        sp_384_proj_point_dbl_12(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(x, x, t5, p384_mod);
+        sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_12(t3, y, p384_mod);
+        sp_384_mont_sub_12(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(y, y, x, p384_mod);
+        sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(y, y, t5, p384_mod);
+        for (i = 0; i < 12; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 12; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_12_ctx {
@@ -26712,6 +26774,13 @@ typedef struct sp_384_proj_point_add_12_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_12_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -26743,252 +26812,149 @@ static int sp_384_proj_point_add_12_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_12(ctx->t1, p384_mod, q->y);
-        sp_384_norm_12(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-            (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_12_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_12(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_12(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_12(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_12(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_12(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_12(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_12(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_12(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_12(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_12(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_12(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_12(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_12(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_12(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_12(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_12(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_12(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_12(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 12; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 12; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_12(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*12;
-    sp_digit* t3 = t + 4*12;
-    sp_digit* t4 = t + 6*12;
-    sp_digit* t5 = t + 8*12;
-    sp_digit* t6 = t + 10*12;
-
-
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_12(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_12(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_12(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_12(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_12(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_12(t2) & sp_384_iszero_12(t4) & maskt) {
-            sp_384_proj_point_dbl_12(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_12(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_12(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_12(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_12(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(x, x, t5, p384_mod);
-            sp_384_mont_mul_12(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_12(t3, y, p384_mod);
-            sp_384_mont_sub_12(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_12(y, y, x, p384_mod);
-            sp_384_mont_mul_12(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_12(y, y, t5, p384_mod);
-
-            for (i = 0; i < 12; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 12; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -27300,7 +27266,6 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_12(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_12(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -27318,7 +27283,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
         sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_12(t2, b, p384_mod);
         sp_384_mont_sub_12(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_12(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -27348,7 +27313,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     sp_384_mont_sqr_12(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_12(t2, b, p384_mod);
     sp_384_mont_sub_12(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_12(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_12(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -27358,7 +27323,7 @@ static void sp_384_proj_point_dbl_n_12(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_12(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_12(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_12(y, y, p384_mod);
 }
@@ -27403,8 +27368,8 @@ typedef struct sp_table_entry_384 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_12(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*12;
@@ -27413,12 +27378,17 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*12;
     sp_digit* t6 = t + 10*12;
 
-    /* Check double */
-    (void)sp_384_sub_12(t1, p384_mod, q->y);
-    sp_384_norm_12(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_12(p->x, q->x) & sp_384_cmp_equal_12(p->z, q->z) &
-        (sp_384_cmp_equal_12(p->y, q->y) | sp_384_cmp_equal_12(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_12(p->x, t2) &
+            sp_384_cmp_equal_12(p->y, t4)) {
         sp_384_proj_point_dbl_12(r, p, t);
     }
     else {
@@ -27430,12 +27400,6 @@ static void sp_384_proj_point_add_qz1_12(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_12(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_12(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_12(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_12(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -31039,7 +31003,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_12(r);
 
-            if (sp_384_iszero_12(r) == 0) {
+            if (!sp_384_iszero_12(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 12, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -31048,7 +31012,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_12(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_12(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_12(s))) {
                     break;
                 }
             }
@@ -33807,7 +33771,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_17(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 17, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_17(r->x, p521_mod);
@@ -33816,7 +33780,7 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_17(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 17, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 17, 0, sizeof(sp_digit) * 17U);
     sp_521_mont_reduce_17(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_17(r->y, p521_mod);
@@ -33825,7 +33789,6 @@ static void sp_521_map_17(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -34331,6 +34294,61 @@ SP_NOINLINE static void sp_521_div2_17(sp_digit* r, const sp_digit* a, const sp_
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_17(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_17(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_17(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_17(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_17(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_17(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_17_ctx {
     int state;
@@ -34341,6 +34359,12 @@ typedef struct sp_521_proj_point_dbl_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_17_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -34465,62 +34489,6 @@ static int sp_521_proj_point_dbl_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, co
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_17(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_17(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_17(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_17(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_17(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_17(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_17(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_17(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_17(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_17(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_17(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_17(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_17(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_17(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_17(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_17(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_17(y, y, t2, p521_mod);
-}
-
 /* Compare two numbers to determine if they are equal.
  * Constant time implementation.
  *
@@ -34551,6 +34519,7 @@ static int sp_521_iszero_17(const sp_digit* a)
             a[16]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -34558,6 +34527,81 @@ static int sp_521_iszero_17(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_17(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*17;
+    sp_digit* t3 = t + 4*17;
+    sp_digit* t4 = t + 6*17;
+    sp_digit* t5 = t + 8*17;
+    sp_digit* t6 = t + 10*17;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(t2, t1) &
+            sp_521_cmp_equal_17(t4, t3)) {
+        sp_521_proj_point_dbl_17(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(x, x, t5, p521_mod);
+        sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_17(t3, y, p521_mod);
+        sp_521_mont_sub_17(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(y, y, x, p521_mod);
+        sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(y, y, t5, p521_mod);
+        for (i = 0; i < 17; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 17; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_17_ctx {
@@ -34576,6 +34620,13 @@ typedef struct sp_521_proj_point_add_17_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_17_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -34607,252 +34658,149 @@ static int sp_521_proj_point_add_17_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_17(ctx->t1, p521_mod, q->y);
-        sp_521_norm_17(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-            (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_17_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_17(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_17(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_17(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_17(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_17(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_17(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_17(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_17(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_17(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_17(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_17(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_17(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_17(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_17(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_17(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_17(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_17(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_17(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 17; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 17; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_17(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*17;
-    sp_digit* t3 = t + 4*17;
-    sp_digit* t4 = t + 6*17;
-    sp_digit* t5 = t + 8*17;
-    sp_digit* t6 = t + 10*17;
-
-
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_17(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_17(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_17(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_17(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_17(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_17(t2) & sp_521_iszero_17(t4) & maskt) {
-            sp_521_proj_point_dbl_17(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_17(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_17(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_17(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_17(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(x, x, t5, p521_mod);
-            sp_521_mont_mul_17(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_17(t3, y, p521_mod);
-            sp_521_mont_sub_17(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_17(y, y, x, p521_mod);
-            sp_521_mont_mul_17(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_17(y, y, t5, p521_mod);
-
-            for (i = 0; i < 17; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 17; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 #ifndef WC_NO_CACHE_RESISTANT
 /* Touch each possible point that could be being copied.
  *
@@ -35198,7 +35146,6 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_17(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_17(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -35216,7 +35163,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
         sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_17(t2, b, p521_mod);
         sp_521_mont_sub_17(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_17(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -35246,7 +35193,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     sp_521_mont_sqr_17(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_17(t2, b, p521_mod);
     sp_521_mont_sub_17(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_17(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_17(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -35256,7 +35203,7 @@ static void sp_521_proj_point_dbl_n_17(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_17(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_17(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_17(y, y, p521_mod);
 }
@@ -35301,8 +35248,8 @@ typedef struct sp_table_entry_521 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_17(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*17;
@@ -35311,12 +35258,17 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*17;
     sp_digit* t6 = t + 10*17;
 
-    /* Check double */
-    (void)sp_521_sub_17(t1, p521_mod, q->y);
-    sp_521_norm_17(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_17(p->x, q->x) & sp_521_cmp_equal_17(p->z, q->z) &
-        (sp_521_cmp_equal_17(p->y, q->y) | sp_521_cmp_equal_17(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_17(p->x, t2) &
+            sp_521_cmp_equal_17(p->y, t4)) {
         sp_521_proj_point_dbl_17(r, p, t);
     }
     else {
@@ -35328,12 +35280,6 @@ static void sp_521_proj_point_add_qz1_17(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_17(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_17(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_17(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_17(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -39995,7 +39941,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_17(r);
 
-            if (sp_521_iszero_17(r) == 0) {
+            if (!sp_521_iszero_17(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 17, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -40009,7 +39955,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_17(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_17(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_17(s))) {
                     break;
                 }
             }
@@ -43352,7 +43298,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_32(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 32, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_32(r->x, p1024_mod);
@@ -43361,7 +43307,7 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_32(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 32, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 32, 0, sizeof(sp_digit) * 32U);
     sp_1024_mont_reduce_32(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_32(r->y, p1024_mod);
@@ -43370,7 +43316,6 @@ static void sp_1024_map_32(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 /* Add two Montgomery form numbers (r = a + b % m).
@@ -44366,47 +44311,108 @@ SP_NOINLINE static void sp_1024_div2_32(sp_digit* r, const sp_digit* a, const sp
  * p  Point to double.
  * t  Temporary ordinate data.
  */
-#ifdef WOLFSSL_SP_NONBLOCK
-typedef struct sp_1024_proj_point_dbl_32_ctx {
-    int state;
-    sp_digit* t1;
-    sp_digit* t2;
+static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
     sp_digit* x;
     sp_digit* y;
     sp_digit* z;
-} sp_1024_proj_point_dbl_32_ctx;
-
-static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
-{
-    int err = FP_WOULDBLOCK;
-    sp_1024_proj_point_dbl_32_ctx* ctx = (sp_1024_proj_point_dbl_32_ctx*)sp_ctx->data;
 
-    typedef char ctx_size_test[sizeof(sp_1024_proj_point_dbl_32_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
-    (void)sizeof(ctx_size_test);
-
-    switch (ctx->state) {
-    case 0:
-        ctx->t1 = t;
-        ctx->t2 = t + 2*32;
-        ctx->x = r->x;
-        ctx->y = r->y;
-        ctx->z = r->z;
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
 
-        /* Put infinity into result. */
-        if (r != p) {
-            r->infinity = p->infinity;
-        }
-        ctx->state = 1;
-        break;
-    case 1:
-        /* T1 = Z * Z */
-        sp_1024_mont_sqr_32(ctx->t1, p->z, p1024_mod, p1024_mp_mod);
-        ctx->state = 2;
-        break;
-    case 2:
-        /* Z = Y * Z */
-        sp_1024_mont_mul_32(ctx->z, p->y, p->z, p1024_mod, p1024_mp_mod);
-        ctx->state = 3;
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_32(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_32(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_32(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
+}
+
+#ifdef WOLFSSL_SP_NONBLOCK
+typedef struct sp_1024_proj_point_dbl_32_ctx {
+    int state;
+    sp_digit* t1;
+    sp_digit* t2;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+} sp_1024_proj_point_dbl_32_ctx;
+
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
+static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
+{
+    int err = FP_WOULDBLOCK;
+    sp_1024_proj_point_dbl_32_ctx* ctx = (sp_1024_proj_point_dbl_32_ctx*)sp_ctx->data;
+
+    typedef char ctx_size_test[sizeof(sp_1024_proj_point_dbl_32_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
+    (void)sizeof(ctx_size_test);
+
+    switch (ctx->state) {
+    case 0:
+        ctx->t1 = t;
+        ctx->t2 = t + 2*32;
+        ctx->x = r->x;
+        ctx->y = r->y;
+        ctx->z = r->z;
+
+        /* Put infinity into result. */
+        if (r != p) {
+            r->infinity = p->infinity;
+        }
+        ctx->state = 1;
+        break;
+    case 1:
+        /* T1 = Z * Z */
+        sp_1024_mont_sqr_32(ctx->t1, p->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
+        break;
+    case 2:
+        /* Z = Y * Z */
+        sp_1024_mont_mul_32(ctx->z, p->y, p->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
         /* Z = 2Z */
@@ -44500,62 +44506,6 @@ static int sp_1024_proj_point_dbl_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_32(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_32(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_32(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_32(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_32(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_32(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_32(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_32(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_32(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_32(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_32(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_32(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_32(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_32(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_32(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_32(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_32(y, y, t2, p1024_mod);
-}
-
 #ifdef WOLFSSL_SP_SMALL
 /* Sub b from a into r. (r = a - b)
  *
@@ -44735,6 +44685,7 @@ static int sp_1024_iszero_32(const sp_digit* a)
             a[24] | a[25] | a[26] | a[27] | a[28] | a[29] | a[30] | a[31]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -44742,6 +44693,81 @@ static int sp_1024_iszero_32(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_32(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*32;
+    sp_digit* t3 = t + 4*32;
+    sp_digit* t4 = t + 6*32;
+    sp_digit* t5 = t + 8*32;
+    sp_digit* t6 = t + 10*32;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(t2, t1) &
+            sp_1024_cmp_equal_32(t4, t3)) {
+        sp_1024_proj_point_dbl_32(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_32(t3, y, p1024_mod);
+        sp_1024_mont_sub_32(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
+        sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(y, y, t5, p1024_mod);
+        for (i = 0; i < 32; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 32; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_32_ctx {
@@ -44760,6 +44786,13 @@ typedef struct sp_1024_proj_point_add_32_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_32_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -44791,252 +44824,149 @@ static int sp_1024_proj_point_add_32_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_32(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_32(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-            (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_32_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_32(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_32(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_32(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_32(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_32(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_32(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_32(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_32(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_32(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_32(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_32(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_32(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_32(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_32(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_32(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_32(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_32(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_32(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 32; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 32; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_32(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*32;
-    sp_digit* t3 = t + 4*32;
-    sp_digit* t4 = t + 6*32;
-    sp_digit* t5 = t + 8*32;
-    sp_digit* t6 = t + 10*32;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_32(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_32(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_32(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_32(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_32(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_32(t2) & sp_1024_iszero_32(t4) & maskt) {
-            sp_1024_proj_point_dbl_32(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_32(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_32(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_32(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_32(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_32(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_32(t3, y, p1024_mod);
-            sp_1024_mont_sub_32(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_32(y, y, x, p1024_mod);
-            sp_1024_mont_mul_32(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_32(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 32; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 32; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Multiply the point by the scalar and return the result.
  * If map is true then convert result to affine coordinates.
  *
@@ -45212,7 +45142,6 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_32(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_32(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -45230,7 +45159,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
         sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_32(t2, b, p1024_mod);
         sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -45260,7 +45189,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     sp_1024_mont_sqr_32(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_32(t2, b, p1024_mod);
     sp_1024_mont_sub_32(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_32(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_32(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -45270,7 +45199,7 @@ static void sp_1024_proj_point_dbl_n_32(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_32(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_32(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_32(y, y, p1024_mod);
 }
@@ -45315,8 +45244,8 @@ typedef struct sp_table_entry_1024 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*32;
@@ -45325,12 +45254,17 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*32;
     sp_digit* t6 = t + 10*32;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_32(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_32(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_32(p->x, q->x) & sp_1024_cmp_equal_32(p->z, q->z) &
-        (sp_1024_cmp_equal_32(p->y, q->y) | sp_1024_cmp_equal_32(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_32(p->x, t2) &
+            sp_1024_cmp_equal_32(p->y, t4)) {
         sp_1024_proj_point_dbl_32(r, p, t);
     }
     else {
@@ -45342,12 +45276,6 @@ static void sp_1024_proj_point_add_qz1_32(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_32(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_32(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_32(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_32(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
diff --git a/wolfcrypt/src/sp_x86_64.c b/wolfcrypt/src/sp_x86_64.c
index d77494169e..a33e3653db 100644
--- a/wolfcrypt/src/sp_x86_64.c
+++ b/wolfcrypt/src/sp_x86_64.c
@@ -8535,7 +8535,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_4(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 4, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_4(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_4(r->x, p256_mod);
@@ -8544,7 +8544,7 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_4(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 4, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_4(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_4(r->y, p256_mod);
@@ -8553,7 +8553,6 @@ static void sp_256_map_4(sp_point_256* r, const sp_point_256* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #ifdef __cplusplus
@@ -8604,6 +8603,61 @@ extern void sp_256_div2_4(sp_digit* r, const sp_digit* a, const sp_digit* m);
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_4(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_4(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_4(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_4(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_4(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_4(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_4(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_4(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_4(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_4(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_4(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_4(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_4(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_4_ctx {
     int state;
@@ -8614,6 +8668,12 @@ typedef struct sp_256_proj_point_dbl_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_4_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -8738,62 +8798,6 @@ static int sp_256_proj_point_dbl_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_4(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_4(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_4(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_4(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_4(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_4(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_4(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_4(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_4(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_4(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_4(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_4(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_4(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_4(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_4(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_4(y, y, t2, p256_mod);
-}
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -8843,7 +8847,6 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_4(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_4(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -8860,7 +8863,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
@@ -8888,7 +8891,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* X = A^2 - 2B */
     sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
     /* Z = Z*Y */
     sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
@@ -8897,7 +8900,7 @@ static void sp_256_proj_point_dbl_n_4(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_4(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_4(y, y, p256_mod);
 }
@@ -8926,6 +8929,7 @@ static int sp_256_iszero_4(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -8933,6 +8937,80 @@ static int sp_256_iszero_4(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_4(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* t3 = t + 4*4;
+    sp_digit* t4 = t + 6*4;
+    sp_digit* t5 = t + 8*4;
+    sp_digit* t6 = t + 10*4;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(t2, t1) &
+            sp_256_cmp_equal_4(t4, t3)) {
+        sp_256_proj_point_dbl_4(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(x, x, t5, p256_mod);
+        sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_4(y, y, x, p256_mod);
+        sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(y, y, t5, p256_mod);
+        for (i = 0; i < 4; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_4_ctx {
@@ -8951,6 +9029,13 @@ typedef struct sp_256_proj_point_add_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_4_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -8982,251 +9067,148 @@ static int sp_256_proj_point_add_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_4(ctx->t1, p256_mod, q->y);
-        sp_256_norm_4(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-            (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_4_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_4(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_4(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_4(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_sub_dbl_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_4(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_4(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_4(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_4(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 4; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_4(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* t3 = t + 4*4;
-    sp_digit* t4 = t + 6*4;
-    sp_digit* t5 = t + 8*4;
-    sp_digit* t6 = t + 10*4;
-
-
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_4(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_4(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_4(t2) & sp_256_iszero_4(t4) & maskt) {
-            sp_256_proj_point_dbl_4(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_4(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_4(x, x, t5, p256_mod);
-            sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_dbl_4(x, x, y, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_4(y, y, x, p256_mod);
-            sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_4(y, y, t5, p256_mod);
-
-            for (i = 0; i < 4; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -9277,7 +9259,7 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_4(r[j].z, z, y, p256_mod, p256_mp_mod);
@@ -9291,7 +9273,6 @@ static void sp_256_proj_point_dbl_n_store_4(sp_point_256* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_256_mont_mul_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_4(y, y, t1, p256_mod);
-
         /* Y = Y/2 */
         sp_256_div2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
@@ -9747,7 +9728,7 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p,
 
     /* x /= z^2 */
     sp_256_mont_mul_avx2_4(r->x, p->x, t2, p256_mod, p256_mp_mod);
-    XMEMSET(r->x + 4, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_avx2_4(r->x, p256_mod, p256_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_256_cmp_4(r->x, p256_mod);
@@ -9756,16 +9737,15 @@ static void sp_256_map_avx2_4(sp_point_256* r, const sp_point_256* p,
 
     /* y /= z^3 */
     sp_256_mont_mul_avx2_4(r->y, p->y, t1, p256_mod, p256_mp_mod);
-    XMEMSET(r->y + 4, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 4, 0, sizeof(sp_digit) * 4U);
     sp_256_mont_reduce_avx2_4(r->y, p256_mod, p256_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_256_cmp_4(r->y, p256_mod);
-    sp_256_cond_sub_avx2_4(r->y, r->y, p256_mod, ~(n >> 63));
+    sp_256_cond_sub_4(r->y, r->y, p256_mod, ~(n >> 63));
     sp_256_norm_4(r->y);
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #define sp_256_mont_add_avx2_4 sp_256_mont_add_4
@@ -9786,6 +9766,61 @@ extern void sp_256_div2_avx2_4(sp_digit* r, const sp_digit* a, const sp_digit* m
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_256_mont_sqr_avx2_4(t1, p->z, p256_mod, p256_mp_mod);
+    /* Z = Y * Z */
+    sp_256_mont_mul_avx2_4(z, p->y, p->z, p256_mod, p256_mp_mod);
+    /* Z = 2Z */
+    sp_256_mont_dbl_avx2_4(z, z, p256_mod);
+    /* T2 = X - T1 */
+    sp_256_mont_sub_avx2_4(t2, p->x, t1, p256_mod);
+    /* T1 = X + T1 */
+    sp_256_mont_add_avx2_4(t1, p->x, t1, p256_mod);
+    /* T2 = T1 * T2 */
+    sp_256_mont_mul_avx2_4(t2, t1, t2, p256_mod, p256_mp_mod);
+    /* T1 = 3T2 */
+    sp_256_mont_tpl_avx2_4(t1, t2, p256_mod);
+    /* Y = 2Y */
+    sp_256_mont_dbl_avx2_4(y, p->y, p256_mod);
+    /* Y = Y * Y */
+    sp_256_mont_sqr_avx2_4(y, y, p256_mod, p256_mp_mod);
+    /* T2 = Y * Y */
+    sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod);
+    /* T2 = T2/2 */
+    sp_256_div2_avx2_4(t2, t2, p256_mod);
+    /* Y = Y * X */
+    sp_256_mont_mul_avx2_4(y, y, p->x, p256_mod, p256_mp_mod);
+    /* X = T1 * T1 */
+    sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_avx2_4(x, x, y, p256_mod);
+    /* X = X - Y */
+    sp_256_mont_sub_avx2_4(x, x, y, p256_mod);
+    /* Y = Y - X */
+    sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod);
+    /* Y = Y * T1 */
+    sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
+    /* Y = Y - T2 */
+    sp_256_mont_sub_avx2_4(y, y, t2, p256_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_dbl_avx2_4_ctx {
     int state;
@@ -9796,6 +9831,12 @@ typedef struct sp_256_proj_point_dbl_avx2_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_dbl_avx2_4_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r, const sp_point_256* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -9920,62 +9961,6 @@ static int sp_256_proj_point_dbl_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_256_proj_point_dbl_avx2_4(sp_point_256* r, const sp_point_256* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_256_mont_sqr_avx2_4(t1, p->z, p256_mod, p256_mp_mod);
-    /* Z = Y * Z */
-    sp_256_mont_mul_avx2_4(z, p->y, p->z, p256_mod, p256_mp_mod);
-    /* Z = 2Z */
-    sp_256_mont_dbl_avx2_4(z, z, p256_mod);
-    /* T2 = X - T1 */
-    sp_256_mont_sub_avx2_4(t2, p->x, t1, p256_mod);
-    /* T1 = X + T1 */
-    sp_256_mont_add_avx2_4(t1, p->x, t1, p256_mod);
-    /* T2 = T1 * T2 */
-    sp_256_mont_mul_avx2_4(t2, t1, t2, p256_mod, p256_mp_mod);
-    /* T1 = 3T2 */
-    sp_256_mont_tpl_avx2_4(t1, t2, p256_mod);
-    /* Y = 2Y */
-    sp_256_mont_dbl_avx2_4(y, p->y, p256_mod);
-    /* Y = Y * Y */
-    sp_256_mont_sqr_avx2_4(y, y, p256_mod, p256_mp_mod);
-    /* T2 = Y * Y */
-    sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod);
-    /* T2 = T2/2 */
-    sp_256_div2_avx2_4(t2, t2, p256_mod);
-    /* Y = Y * X */
-    sp_256_mont_mul_avx2_4(y, y, p->x, p256_mod, p256_mp_mod);
-    /* X = T1 * T1 */
-    sp_256_mont_sqr_avx2_4(x, t1, p256_mod, p256_mp_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_avx2_4(x, x, y, p256_mod);
-    /* X = X - Y */
-    sp_256_mont_sub_avx2_4(x, x, y, p256_mod);
-    /* Y = Y - X */
-    sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod);
-    /* Y = Y * T1 */
-    sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
-    /* Y = Y - T2 */
-    sp_256_mont_sub_avx2_4(y, y, t2, p256_mod);
-}
-
 #define sp_256_mont_tpl_lower_avx2_4 sp_256_mont_tpl_lower_4
 #define sp_256_mont_sub_dbl_avx2_4 sp_256_mont_sub_dbl_4
 #define sp_256_mont_dbl_sub_avx2_4 sp_256_mont_dbl_sub_4
@@ -10007,7 +9992,6 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
     /* W = Z^4 */
     sp_256_mont_sqr_avx2_4(w, z, p256_mod, p256_mp_mod);
     sp_256_mont_sqr_avx2_4(w, w, p256_mod, p256_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -10024,7 +10008,7 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
@@ -10051,8 +10035,8 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
     sp_256_mont_mul_avx2_4(b, t1, x, p256_mod, p256_mp_mod);
     /* X = A^2 - 2B */
     sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
-    sp_256_mont_sub_dbl_4(x, x, b, p256_mod);
-    /*   b = 2.(B - X) */
+    sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
+    /* B = 2.(B - X) */
     sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
     /* Z = Z*Y */
     sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
@@ -10061,11 +10045,12 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_256_mont_mul_avx2_4(y, b, a, p256_mod, p256_mp_mod);
     sp_256_mont_sub_avx2_4(y, y, t1, p256_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_256_div2_avx2_4(y, y, p256_mod);
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -10073,6 +10058,80 @@ static void sp_256_proj_point_dbl_n_avx2_4(sp_point_256* p, int i,
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
+        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*4;
+    sp_digit* t3 = t + 4*4;
+    sp_digit* t4 = t + 6*4;
+    sp_digit* t5 = t + 8*4;
+    sp_digit* t6 = t + 10*4;
+
+    /* U1 = X1*Z2^2 */
+    sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t3, t1, q->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t1, t1, p->x, p256_mod, p256_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_256_mont_mul_avx2_4(t3, t3, p->y, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(t2, t1) &
+            sp_256_cmp_equal_4(t4, t3)) {
+        sp_256_proj_point_dbl_avx2_4(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_avx2_4(t4, t4, t3, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(y, t1, t5, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_avx2_4(z, p->z, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
+        sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod);
+        sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
+        for (i = 0; i < 4; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 4; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_256_proj_point_add_avx2_4_ctx {
@@ -10091,6 +10150,13 @@ typedef struct sp_256_proj_point_add_avx2_4_ctx {
     sp_digit* z;
 } sp_256_proj_point_add_avx2_4_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
     const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
@@ -10122,251 +10188,148 @@ static int sp_256_proj_point_add_avx2_4_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_256_sub_avx2_4(ctx->t1, p256_mod, q->y);
-        sp_256_norm_avx2_4(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_256_cmp_equal_avx2_4(p->x, q->x) & sp_256_cmp_equal_avx2_4(p->z, q->z) &
-            (sp_256_cmp_equal_avx2_4(p->y, q->y) | sp_256_cmp_equal_avx2_4(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_256_mont_sqr_avx2_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_256_proj_point_dbl_avx2_4_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_256_mont_mul_avx2_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_256_mont_mul_avx2_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_avx2_4(ctx->t1, q->z, p256_mod, p256_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_256_mont_sqr_avx2_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_256_mont_mul_avx2_4(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_256_mont_mul_avx2_4(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_avx2_4(ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_256_mont_mul_avx2_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_256_mont_mul_avx2_4(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_256_mont_mul_avx2_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_256_mont_mul_avx2_4(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_256_cmp_equal_4(ctx->t2, ctx->t1) &
+                sp_256_cmp_equal_4(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_256_proj_point_dbl_avx2_4(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_avx2_4(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
+        /* H = U2 - U1 */
+        sp_256_mont_sub_avx2_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_avx2_4(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
+        /* R = S2 - S1 */
+        sp_256_mont_sub_avx2_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_256_mont_sub_avx2_4(ctx->t2, ctx->t2, ctx->t1, p256_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_256_mont_sqr_avx2_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_256_mont_sub_avx2_4(ctx->t4, ctx->t4, ctx->t3, p256_mod);
+        sp_256_mont_mul_avx2_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_256_mont_sqr_avx2_4(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_256_mont_mul_avx2_4(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_256_mont_mul_avx2_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_256_mont_mul_avx2_4(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_avx2_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_256_mont_mul_avx2_4(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->t5, p256_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_256_mont_sqr_avx2_4(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->t5, p256_mod);
+        sp_256_mont_sub_dbl_avx2_4(ctx->x, ctx->x, ctx->y, p256_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_256_mont_mul_avx2_4(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_256_mont_dbl_avx2_4(ctx->t3, ctx->y, p256_mod);
+        sp_256_mont_mul_avx2_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_256_mont_sub_avx2_4(ctx->x, ctx->x, ctx->t3, p256_mod);
+        sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->t5, p256_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_256_mont_sub_lower_avx2_4(ctx->y, ctx->y, ctx->x, p256_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_256_mont_mul_avx2_4(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_256_mont_sub_avx2_4(ctx->y, ctx->y, ctx->t5, p256_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 4; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 4; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_256_proj_point_add_avx2_4(sp_point_256* r,
-        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*4;
-    sp_digit* t3 = t + 4*4;
-    sp_digit* t4 = t + 6*4;
-    sp_digit* t5 = t + 8*4;
-    sp_digit* t6 = t + 10*4;
-
-
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
-        sp_256_proj_point_dbl_avx2_4(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t3, t1, q->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t1, t1, p->x, p256_mod, p256_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_256_mont_mul_avx2_4(t3, t3, p->y, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
-        /* H = U2 - U1 */
-        sp_256_mont_sub_avx2_4(t2, t2, t1, p256_mod);
-        /* R = S2 - S1 */
-        sp_256_mont_sub_avx2_4(t4, t4, t3, p256_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_256_iszero_4(t2) & sp_256_iszero_4(t4) & maskt) {
-            sp_256_proj_point_dbl_avx2_4(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_avx2_4(y, t1, t5, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_256_mont_mul_avx2_4(z, p->z, t2, p256_mod, p256_mp_mod);
-            sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod);
-            sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_avx2_4(x, x, t5, p256_mod);
-            sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_dbl_avx2_4(x, x, y, p256_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_256_mont_sub_lower_avx2_4(y, y, x, p256_mod);
-            sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
-            sp_256_mont_sub_avx2_4(y, y, t5, p256_mod);
-
-            for (i = 0; i < 4; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 4; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -10417,7 +10380,7 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
         /* X = A^2 - 2B */
         sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_dbl_avx2_4(x, x, b, p256_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_256_mont_dbl_sub_avx2_4(b, b, x, p256_mod);
         /* Z = Z*Y */
         sp_256_mont_mul_avx2_4(r[j].z, z, y, p256_mod, p256_mp_mod);
@@ -10431,7 +10394,6 @@ static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point_256* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_256_mont_mul_avx2_4(y, b, a, p256_mod, p256_mp_mod);
         sp_256_mont_sub_avx2_4(y, y, t1, p256_mod);
-
         /* Y = Y/2 */
         sp_256_div2_avx2_4(r[j].y, y, p256_mod);
         r[j].infinity = 0;
@@ -10680,8 +10642,8 @@ typedef struct sp_table_entry_256 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_4(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*4;
@@ -10690,12 +10652,17 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
     sp_digit* t5 = t + 8*4;
     sp_digit* t6 = t + 10*4;
 
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(p->x, t2) &
+            sp_256_cmp_equal_4(p->y, t4)) {
         sp_256_proj_point_dbl_4(r, p, t);
     }
     else {
@@ -10707,12 +10674,6 @@ static void sp_256_proj_point_add_qz1_4(sp_point_256* r, const sp_point_256* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_4(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -11168,8 +11129,8 @@ static int sp_256_ecc_mulmod_4(sp_point_256* r, const sp_point_256* g, const sp_
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256* p,
-        const sp_point_256* q, sp_digit* t)
+static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r,
+    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*4;
@@ -11178,12 +11139,17 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256
     sp_digit* t5 = t + 8*4;
     sp_digit* t6 = t + 10*4;
 
-    /* Check double */
-    (void)sp_256_sub_4(t1, p256_mod, q->y);
-    sp_256_norm_4(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
-        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t4, t2, p->z, p256_mod, p256_mp_mod);
+    sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_256_cmp_equal_4(p->x, t2) &
+            sp_256_cmp_equal_4(p->y, t4)) {
         sp_256_proj_point_dbl_avx2_4(r, p, t);
     }
     else {
@@ -11195,12 +11161,6 @@ static void sp_256_proj_point_add_qz1_avx2_4(sp_point_256* r, const sp_point_256
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_256_mont_sqr_avx2_4(t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t4, t2, p->z, p256_mod, p256_mp_mod);
-        sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
         /* H = U2 - X1 */
         sp_256_mont_sub_avx2_4(t2, t2, p->x, p256_mod);
         /* R = S2 - Y1 */
@@ -25767,7 +25727,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_256_norm_4(r);
 
-            if (sp_256_iszero_4(r) == 0) {
+            if (!sp_256_iszero_4(r)) {
                 /* x is modified in calculation of s. */
                 sp_256_from_mp(x, 4, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -25776,7 +25736,7 @@ int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_256_calc_s_4(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_256_iszero_4(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_256_iszero_4(s))) {
                     break;
                 }
             }
@@ -27712,7 +27672,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_6(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 6, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_6(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_6(r->x, p384_mod);
@@ -27721,7 +27681,7 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_6(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 6, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_6(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_6(r->y, p384_mod);
@@ -27730,7 +27690,6 @@ static void sp_384_map_6(sp_point_384* r, const sp_point_384* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #ifdef __cplusplus
@@ -27781,6 +27740,61 @@ extern void sp_384_div2_6(sp_digit* r, const sp_digit* a, const sp_digit* m);
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_6(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_6(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_6(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_6(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_6(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_6(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_6(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_6(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_6(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_6(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_6(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_6(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_6(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_6(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_6(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_6_ctx {
     int state;
@@ -27791,6 +27805,12 @@ typedef struct sp_384_proj_point_dbl_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_6_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -27915,62 +27935,6 @@ static int sp_384_proj_point_dbl_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_6(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_6(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_6(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_6(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_6(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_6(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_6(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_6(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_6(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_6(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_6(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_6(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_6(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_6(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_6(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_6(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_6(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_6(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_6(y, y, t2, p384_mod);
-}
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -28014,7 +27978,6 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_6(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_6(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -28032,7 +27995,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
         sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_6(t2, b, p384_mod);
         sp_384_mont_sub_6(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -28062,7 +28025,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_6(t2, b, p384_mod);
     sp_384_mont_sub_6(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_6(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -28072,7 +28035,7 @@ static void sp_384_proj_point_dbl_n_6(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_6(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_6(y, y, p384_mod);
 }
@@ -28101,6 +28064,7 @@ static int sp_384_iszero_6(const sp_digit* a)
     return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -28108,6 +28072,81 @@ static int sp_384_iszero_6(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_6(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* t3 = t + 4*6;
+    sp_digit* t4 = t + 6*6;
+    sp_digit* t5 = t + 8*6;
+    sp_digit* t6 = t + 10*6;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(t2, t1) &
+            sp_384_cmp_equal_6(t4, t3)) {
+        sp_384_proj_point_dbl_6(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_6(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_6(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(x, x, t5, p384_mod);
+        sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_6(t3, y, p384_mod);
+        sp_384_mont_sub_6(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_6(y, y, x, p384_mod);
+        sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(y, y, t5, p384_mod);
+        for (i = 0; i < 6; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_6_ctx {
@@ -28126,6 +28165,13 @@ typedef struct sp_384_proj_point_add_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_6_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -28157,252 +28203,149 @@ static int sp_384_proj_point_add_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_6(ctx->t1, p384_mod, q->y);
-        sp_384_norm_6(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-            (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_6_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_6(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_6(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_6(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_6(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_6(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_6(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_6(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 6; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_6(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* t3 = t + 4*6;
-    sp_digit* t4 = t + 6*6;
-    sp_digit* t5 = t + 8*6;
-    sp_digit* t6 = t + 10*6;
-
-
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_6(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_6(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_6(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_6(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_6(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_6(t2) & sp_384_iszero_6(t4) & maskt) {
-            sp_384_proj_point_dbl_6(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_6(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_6(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_6(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_6(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_6(x, x, t5, p384_mod);
-            sp_384_mont_mul_6(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_6(t3, y, p384_mod);
-            sp_384_mont_sub_6(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_6(y, y, x, p384_mod);
-            sp_384_mont_mul_6(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_6(y, y, t5, p384_mod);
-
-            for (i = 0; i < 6; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -28455,7 +28398,7 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         sp_384_mont_sqr_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_6(t2, b, p384_mod);
         sp_384_mont_sub_6(x, x, t2, p384_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -28470,7 +28413,6 @@ static void sp_384_proj_point_dbl_n_store_6(sp_point_384* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_384_mont_mul_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_6(y, y, t1, p384_mod);
-
         /* Y = Y/2 */
         sp_384_div2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
@@ -28962,7 +28904,7 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p,
 
     /* x /= z^2 */
     sp_384_mont_mul_avx2_6(r->x, p->x, t2, p384_mod, p384_mp_mod);
-    XMEMSET(r->x + 6, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_avx2_6(r->x, p384_mod, p384_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_384_cmp_6(r->x, p384_mod);
@@ -28971,16 +28913,15 @@ static void sp_384_map_avx2_6(sp_point_384* r, const sp_point_384* p,
 
     /* y /= z^3 */
     sp_384_mont_mul_avx2_6(r->y, p->y, t1, p384_mod, p384_mp_mod);
-    XMEMSET(r->y + 6, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 6, 0, sizeof(sp_digit) * 6U);
     sp_384_mont_reduce_avx2_6(r->y, p384_mod, p384_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_384_cmp_6(r->y, p384_mod);
-    sp_384_cond_sub_avx2_6(r->y, r->y, p384_mod, ~(n >> 63));
+    sp_384_cond_sub_6(r->y, r->y, p384_mod, ~(n >> 63));
     sp_384_norm_6(r->y);
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #define sp_384_mont_add_avx2_6 sp_384_mont_add_6
@@ -29001,6 +28942,61 @@ extern void sp_384_div2_avx2_6(sp_digit* r, const sp_digit* a, const sp_digit* m
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_384_mont_sqr_avx2_6(t1, p->z, p384_mod, p384_mp_mod);
+    /* Z = Y * Z */
+    sp_384_mont_mul_avx2_6(z, p->y, p->z, p384_mod, p384_mp_mod);
+    /* Z = 2Z */
+    sp_384_mont_dbl_avx2_6(z, z, p384_mod);
+    /* T2 = X - T1 */
+    sp_384_mont_sub_avx2_6(t2, p->x, t1, p384_mod);
+    /* T1 = X + T1 */
+    sp_384_mont_add_avx2_6(t1, p->x, t1, p384_mod);
+    /* T2 = T1 * T2 */
+    sp_384_mont_mul_avx2_6(t2, t1, t2, p384_mod, p384_mp_mod);
+    /* T1 = 3T2 */
+    sp_384_mont_tpl_avx2_6(t1, t2, p384_mod);
+    /* Y = 2Y */
+    sp_384_mont_dbl_avx2_6(y, p->y, p384_mod);
+    /* Y = Y * Y */
+    sp_384_mont_sqr_avx2_6(y, y, p384_mod, p384_mp_mod);
+    /* T2 = Y * Y */
+    sp_384_mont_sqr_avx2_6(t2, y, p384_mod, p384_mp_mod);
+    /* T2 = T2/2 */
+    sp_384_div2_avx2_6(t2, t2, p384_mod);
+    /* Y = Y * X */
+    sp_384_mont_mul_avx2_6(y, y, p->x, p384_mod, p384_mp_mod);
+    /* X = T1 * T1 */
+    sp_384_mont_sqr_avx2_6(x, t1, p384_mod, p384_mp_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_avx2_6(x, x, y, p384_mod);
+    /* X = X - Y */
+    sp_384_mont_sub_avx2_6(x, x, y, p384_mod);
+    /* Y = Y - X */
+    sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod);
+    /* Y = Y * T1 */
+    sp_384_mont_mul_avx2_6(y, y, t1, p384_mod, p384_mp_mod);
+    /* Y = Y - T2 */
+    sp_384_mont_sub_avx2_6(y, y, t2, p384_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_dbl_avx2_6_ctx {
     int state;
@@ -29011,6 +29007,12 @@ typedef struct sp_384_proj_point_dbl_avx2_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_dbl_avx2_6_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r, const sp_point_384* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -29135,62 +29137,6 @@ static int sp_384_proj_point_dbl_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_384_proj_point_dbl_avx2_6(sp_point_384* r, const sp_point_384* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_384_mont_sqr_avx2_6(t1, p->z, p384_mod, p384_mp_mod);
-    /* Z = Y * Z */
-    sp_384_mont_mul_avx2_6(z, p->y, p->z, p384_mod, p384_mp_mod);
-    /* Z = 2Z */
-    sp_384_mont_dbl_avx2_6(z, z, p384_mod);
-    /* T2 = X - T1 */
-    sp_384_mont_sub_avx2_6(t2, p->x, t1, p384_mod);
-    /* T1 = X + T1 */
-    sp_384_mont_add_avx2_6(t1, p->x, t1, p384_mod);
-    /* T2 = T1 * T2 */
-    sp_384_mont_mul_avx2_6(t2, t1, t2, p384_mod, p384_mp_mod);
-    /* T1 = 3T2 */
-    sp_384_mont_tpl_avx2_6(t1, t2, p384_mod);
-    /* Y = 2Y */
-    sp_384_mont_dbl_avx2_6(y, p->y, p384_mod);
-    /* Y = Y * Y */
-    sp_384_mont_sqr_avx2_6(y, y, p384_mod, p384_mp_mod);
-    /* T2 = Y * Y */
-    sp_384_mont_sqr_avx2_6(t2, y, p384_mod, p384_mp_mod);
-    /* T2 = T2/2 */
-    sp_384_div2_avx2_6(t2, t2, p384_mod);
-    /* Y = Y * X */
-    sp_384_mont_mul_avx2_6(y, y, p->x, p384_mod, p384_mp_mod);
-    /* X = T1 * T1 */
-    sp_384_mont_sqr_avx2_6(x, t1, p384_mod, p384_mp_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_avx2_6(x, x, y, p384_mod);
-    /* X = X - Y */
-    sp_384_mont_sub_avx2_6(x, x, y, p384_mod);
-    /* Y = Y - X */
-    sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod);
-    /* Y = Y * T1 */
-    sp_384_mont_mul_avx2_6(y, y, t1, p384_mod, p384_mp_mod);
-    /* Y = Y - T2 */
-    sp_384_mont_sub_avx2_6(y, y, t2, p384_mod);
-}
-
 #define sp_384_mont_dbl_lower_avx2_6 sp_384_mont_dbl_lower_6
 #define sp_384_mont_tpl_lower_avx2_6 sp_384_mont_tpl_lower_6
 /* Double the Montgomery form projective point p a number of times.
@@ -29222,7 +29168,6 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
     /* W = Z^4 */
     sp_384_mont_sqr_avx2_6(w, z, p384_mod, p384_mp_mod);
     sp_384_mont_sqr_avx2_6(w, w, p384_mod, p384_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -29240,7 +29185,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
         sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_avx2_6(t2, b, p384_mod);
         sp_384_mont_sub_avx2_6(x, x, t2, p384_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -29270,7 +29215,7 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
     sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod);
     sp_384_mont_dbl_avx2_6(t2, b, p384_mod);
     sp_384_mont_sub_avx2_6(x, x, t2, p384_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod);
     sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod);
     /* Z = Z*Y */
@@ -29280,11 +29225,12 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod);
     sp_384_mont_sub_avx2_6(y, y, t1, p384_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_384_div2_avx2_6(y, y, p384_mod);
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -29292,6 +29238,81 @@ static void sp_384_proj_point_dbl_n_avx2_6(sp_point_384* p, int i,
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_384_proj_point_add_avx2_6(sp_point_384* r,
+        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*6;
+    sp_digit* t3 = t + 4*6;
+    sp_digit* t4 = t + 6*6;
+    sp_digit* t5 = t + 8*6;
+    sp_digit* t6 = t + 10*6;
+
+    /* U1 = X1*Z2^2 */
+    sp_384_mont_sqr_avx2_6(t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t3, t1, q->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t1, t1, p->x, p384_mod, p384_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_384_mont_mul_avx2_6(t3, t3, p->y, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(t2, t1) &
+            sp_384_cmp_equal_6(t4, t3)) {
+        sp_384_proj_point_dbl_avx2_6(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_384_mont_sub_avx2_6(t2, t2, t1, p384_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_avx2_6(t4, t4, t3, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(y, t1, t5, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_avx2_6(z, p->z, t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(z, z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_avx2_6(x, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_avx2_6(x, x, t5, p384_mod);
+        sp_384_mont_mul_avx2_6(t5, t5, t3, p384_mod, p384_mp_mod);
+        sp_384_mont_dbl_avx2_6(t3, y, p384_mod);
+        sp_384_mont_sub_avx2_6(x, x, t3, p384_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod);
+        sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_avx2_6(y, y, t5, p384_mod);
+        for (i = 0; i < 6; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 6; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_384_proj_point_add_avx2_6_ctx {
@@ -29310,6 +29331,13 @@ typedef struct sp_384_proj_point_add_avx2_6_ctx {
     sp_digit* z;
 } sp_384_proj_point_add_avx2_6_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
     const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
@@ -29341,252 +29369,149 @@ static int sp_384_proj_point_add_avx2_6_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_384_sub_avx2_6(ctx->t1, p384_mod, q->y);
-        sp_384_norm_avx2_6(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_384_cmp_equal_avx2_6(p->x, q->x) & sp_384_cmp_equal_avx2_6(p->z, q->z) &
-            (sp_384_cmp_equal_avx2_6(p->y, q->y) | sp_384_cmp_equal_avx2_6(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_384_mont_sqr_avx2_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_384_proj_point_dbl_avx2_6_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_384_mont_mul_avx2_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_384_mont_mul_avx2_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_avx2_6(ctx->t1, q->z, p384_mod, p384_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_384_mont_sqr_avx2_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_384_mont_mul_avx2_6(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_384_mont_mul_avx2_6(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_avx2_6(ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_384_mont_mul_avx2_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_384_mont_mul_avx2_6(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_384_mont_mul_avx2_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_384_mont_mul_avx2_6(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_384_cmp_equal_6(ctx->t2, ctx->t1) &
+                sp_384_cmp_equal_6(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_384_proj_point_dbl_avx2_6(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_avx2_6(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
+        /* H = U2 - U1 */
+        sp_384_mont_sub_avx2_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_avx2_6(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
+        /* R = S2 - S1 */
+        sp_384_mont_sub_avx2_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_384_mont_sub_avx2_6(ctx->t2, ctx->t2, ctx->t1, p384_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_384_mont_sqr_avx2_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_384_mont_sub_avx2_6(ctx->t4, ctx->t4, ctx->t3, p384_mod);
+        sp_384_mont_mul_avx2_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_384_mont_sqr_avx2_6(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_384_mont_mul_avx2_6(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_384_mont_mul_avx2_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_384_mont_mul_avx2_6(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
+        sp_384_mont_sqr_avx2_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_384_mont_mul_avx2_6(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
+        sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t5, p384_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_384_mont_sqr_avx2_6(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
+        sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t5, p384_mod);
+        sp_384_mont_dbl_avx2_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t3, p384_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_384_mont_mul_avx2_6(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_384_mont_dbl_avx2_6(ctx->t3, ctx->y, p384_mod);
+        sp_384_mont_mul_avx2_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_384_mont_sub_avx2_6(ctx->x, ctx->x, ctx->t3, p384_mod);
+        sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->t5, p384_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_384_mont_sub_lower_avx2_6(ctx->y, ctx->y, ctx->x, p384_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_384_mont_mul_avx2_6(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_384_mont_sub_avx2_6(ctx->y, ctx->y, ctx->t5, p384_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 6; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 6; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_384_proj_point_add_avx2_6(sp_point_384* r,
-        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*6;
-    sp_digit* t3 = t + 4*6;
-    sp_digit* t4 = t + 6*6;
-    sp_digit* t5 = t + 8*6;
-    sp_digit* t6 = t + 10*6;
-
-
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
-        sp_384_proj_point_dbl_avx2_6(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_384_mont_sqr_avx2_6(t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t3, t1, q->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t1, t1, p->x, p384_mod, p384_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_384_mont_mul_avx2_6(t3, t3, p->y, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod);
-        /* H = U2 - U1 */
-        sp_384_mont_sub_avx2_6(t2, t2, t1, p384_mod);
-        /* R = S2 - S1 */
-        sp_384_mont_sub_avx2_6(t4, t4, t3, p384_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_384_iszero_6(t2) & sp_384_iszero_6(t4) & maskt) {
-            sp_384_proj_point_dbl_avx2_6(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_384_mont_sqr_avx2_6(t5, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_avx2_6(y, t1, t5, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_avx2_6(t5, t5, t2, p384_mod, p384_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_384_mont_mul_avx2_6(z, p->z, t2, p384_mod, p384_mp_mod);
-            sp_384_mont_mul_avx2_6(z, z, q->z, p384_mod, p384_mp_mod);
-            sp_384_mont_sqr_avx2_6(x, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_avx2_6(x, x, t5, p384_mod);
-            sp_384_mont_mul_avx2_6(t5, t5, t3, p384_mod, p384_mp_mod);
-            sp_384_mont_dbl_avx2_6(t3, y, p384_mod);
-            sp_384_mont_sub_avx2_6(x, x, t3, p384_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_384_mont_sub_lower_avx2_6(y, y, x, p384_mod);
-            sp_384_mont_mul_avx2_6(y, y, t4, p384_mod, p384_mp_mod);
-            sp_384_mont_sub_avx2_6(y, y, t5, p384_mod);
-
-            for (i = 0; i < 6; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 6; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -29639,7 +29564,7 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r,
         sp_384_mont_sqr_avx2_6(x, a, p384_mod, p384_mp_mod);
         sp_384_mont_dbl_avx2_6(t2, b, p384_mod);
         sp_384_mont_sub_avx2_6(x, x, t2, p384_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_384_mont_sub_lower_avx2_6(t2, b, x, p384_mod);
         sp_384_mont_dbl_lower_avx2_6(b, t2, p384_mod);
         /* Z = Z*Y */
@@ -29654,7 +29579,6 @@ static void sp_384_proj_point_dbl_n_store_avx2_6(sp_point_384* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_384_mont_mul_avx2_6(y, b, a, p384_mod, p384_mp_mod);
         sp_384_mont_sub_avx2_6(y, y, t1, p384_mod);
-
         /* Y = Y/2 */
         sp_384_div2_avx2_6(r[j].y, y, p384_mod);
         r[j].infinity = 0;
@@ -29903,8 +29827,8 @@ typedef struct sp_table_entry_384 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_6(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*6;
@@ -29913,12 +29837,17 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
     sp_digit* t5 = t + 8*6;
     sp_digit* t6 = t + 10*6;
 
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(p->x, t2) &
+            sp_384_cmp_equal_6(p->y, t4)) {
         sp_384_proj_point_dbl_6(r, p, t);
     }
     else {
@@ -29930,12 +29859,6 @@ static void sp_384_proj_point_add_qz1_6(sp_point_384* r, const sp_point_384* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_6(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_6(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -30392,8 +30315,8 @@ static int sp_384_ecc_mulmod_6(sp_point_384* r, const sp_point_384* g, const sp_
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384* p,
-        const sp_point_384* q, sp_digit* t)
+static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r,
+    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*6;
@@ -30402,12 +30325,17 @@ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384
     sp_digit* t5 = t + 8*6;
     sp_digit* t6 = t + 10*6;
 
-    /* Check double */
-    (void)sp_384_sub_6(t1, p384_mod, q->y);
-    sp_384_norm_6(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_384_cmp_equal_6(p->x, q->x) & sp_384_cmp_equal_6(p->z, q->z) &
-        (sp_384_cmp_equal_6(p->y, q->y) | sp_384_cmp_equal_6(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod);
+    sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_384_cmp_equal_6(p->x, t2) &
+            sp_384_cmp_equal_6(p->y, t4)) {
         sp_384_proj_point_dbl_avx2_6(r, p, t);
     }
     else {
@@ -30419,12 +30347,6 @@ static void sp_384_proj_point_add_qz1_avx2_6(sp_point_384* r, const sp_point_384
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_384_mont_sqr_avx2_6(t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t4, t2, p->z, p384_mod, p384_mp_mod);
-        sp_384_mont_mul_avx2_6(t2, t2, q->x, p384_mod, p384_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_384_mont_mul_avx2_6(t4, t4, q->y, p384_mod, p384_mp_mod);
         /* H = U2 - X1 */
         sp_384_mont_sub_avx2_6(t2, t2, p->x, p384_mod);
         /* R = S2 - Y1 */
@@ -50664,7 +50586,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_384_norm_6(r);
 
-            if (sp_384_iszero_6(r) == 0) {
+            if (!sp_384_iszero_6(r)) {
                 /* x is modified in calculation of s. */
                 sp_384_from_mp(x, 6, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -50673,7 +50595,7 @@ int sp_ecc_sign_384(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_384_calc_s_6(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_384_iszero_6(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_384_iszero_6(s))) {
                     break;
                 }
             }
@@ -52645,7 +52567,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_9(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_9(r->x, p521_mod);
@@ -52654,7 +52576,7 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_9(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_9(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_9(r->y, p521_mod);
@@ -52663,7 +52585,6 @@ static void sp_521_map_9(sp_point_521* r, const sp_point_521* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #ifdef __cplusplus
@@ -52708,6 +52629,61 @@ extern void sp_521_div2_9(sp_digit* r, const sp_digit* a, const sp_digit* m);
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_9(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_9(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_9(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_9(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_9(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_9(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_9_ctx {
     int state;
@@ -52718,6 +52694,12 @@ typedef struct sp_521_proj_point_dbl_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_9_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -52842,62 +52824,6 @@ static int sp_521_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, con
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_9(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_9(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_9(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_9(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_9(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_9(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_9(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_9(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_9(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_9(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_9(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_9(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_9(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_9(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_9(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_9(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_9(y, y, t2, p521_mod);
-}
-
 #define sp_521_mont_dbl_lower_9 sp_521_mont_dbl_9
 #define sp_521_mont_tpl_lower_9 sp_521_mont_tpl_9
 /* Double the Montgomery form projective point p a number of times.
@@ -52929,7 +52855,6 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_9(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_9(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -52947,7 +52872,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -52977,7 +52902,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_9(t2, b, p521_mod);
     sp_521_mont_sub_9(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_9(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -52987,7 +52912,7 @@ static void sp_521_proj_point_dbl_n_9(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_9(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_9(y, y, p521_mod);
 }
@@ -53018,6 +52943,7 @@ static int sp_521_iszero_9(const sp_digit* a)
             a[8]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -53025,6 +52951,81 @@ static int sp_521_iszero_9(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_9(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* t3 = t + 4*9;
+    sp_digit* t4 = t + 6*9;
+    sp_digit* t5 = t + 8*9;
+    sp_digit* t6 = t + 10*9;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(t2, t1) &
+            sp_521_cmp_equal_9(t4, t3)) {
+        sp_521_proj_point_dbl_9(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(x, x, t5, p521_mod);
+        sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_9(t3, y, p521_mod);
+        sp_521_mont_sub_9(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(y, y, x, p521_mod);
+        sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(y, y, t5, p521_mod);
+        for (i = 0; i < 9; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_9_ctx {
@@ -53043,6 +53044,13 @@ typedef struct sp_521_proj_point_add_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_9_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -53074,252 +53082,149 @@ static int sp_521_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_9(ctx->t1, p521_mod, q->y);
-        sp_521_norm_9(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-            (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_9(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_9(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_9(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_9(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_9(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_9(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 9; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_9(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* t3 = t + 4*9;
-    sp_digit* t4 = t + 6*9;
-    sp_digit* t5 = t + 8*9;
-    sp_digit* t6 = t + 10*9;
-
-
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_9(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_9(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_9(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_9(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_9(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_9(t2) & sp_521_iszero_9(t4) & maskt) {
-            sp_521_proj_point_dbl_9(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_9(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_9(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_9(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_9(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(x, x, t5, p521_mod);
-            sp_521_mont_mul_9(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_9(t3, y, p521_mod);
-            sp_521_mont_sub_9(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_9(y, y, x, p521_mod);
-            sp_521_mont_mul_9(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_9(y, y, t5, p521_mod);
-
-            for (i = 0; i < 9; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -53372,7 +53277,7 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         sp_521_mont_sqr_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_9(t2, b, p521_mod);
         sp_521_mont_sub_9(x, x, t2, p521_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -53387,7 +53292,6 @@ static void sp_521_proj_point_dbl_n_store_9(sp_point_521* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_521_mont_mul_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_9(y, y, t1, p521_mod);
-
         /* Y = Y/2 */
         sp_521_div2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
@@ -53856,7 +53760,7 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p,
 
     /* x /= z^2 */
     sp_521_mont_mul_avx2_9(r->x, p->x, t2, p521_mod, p521_mp_mod);
-    XMEMSET(r->x + 9, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_avx2_9(r->x, p521_mod, p521_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_521_cmp_9(r->x, p521_mod);
@@ -53865,16 +53769,15 @@ static void sp_521_map_avx2_9(sp_point_521* r, const sp_point_521* p,
 
     /* y /= z^3 */
     sp_521_mont_mul_avx2_9(r->y, p->y, t1, p521_mod, p521_mp_mod);
-    XMEMSET(r->y + 9, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 9, 0, sizeof(sp_digit) * 9U);
     sp_521_mont_reduce_avx2_9(r->y, p521_mod, p521_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_521_cmp_9(r->y, p521_mod);
-    sp_521_cond_sub_avx2_9(r->y, r->y, p521_mod, ~(n >> 63));
+    sp_521_cond_sub_9(r->y, r->y, p521_mod, ~(n >> 63));
     sp_521_norm_9(r->y);
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #define sp_521_mont_add_avx2_9 sp_521_mont_add_9
@@ -53895,6 +53798,61 @@ extern void sp_521_div2_avx2_9(sp_digit* r, const sp_digit* a, const sp_digit* m
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_521_mont_sqr_avx2_9(t1, p->z, p521_mod, p521_mp_mod);
+    /* Z = Y * Z */
+    sp_521_mont_mul_avx2_9(z, p->y, p->z, p521_mod, p521_mp_mod);
+    /* Z = 2Z */
+    sp_521_mont_dbl_avx2_9(z, z, p521_mod);
+    /* T2 = X - T1 */
+    sp_521_mont_sub_avx2_9(t2, p->x, t1, p521_mod);
+    /* T1 = X + T1 */
+    sp_521_mont_add_avx2_9(t1, p->x, t1, p521_mod);
+    /* T2 = T1 * T2 */
+    sp_521_mont_mul_avx2_9(t2, t1, t2, p521_mod, p521_mp_mod);
+    /* T1 = 3T2 */
+    sp_521_mont_tpl_avx2_9(t1, t2, p521_mod);
+    /* Y = 2Y */
+    sp_521_mont_dbl_avx2_9(y, p->y, p521_mod);
+    /* Y = Y * Y */
+    sp_521_mont_sqr_avx2_9(y, y, p521_mod, p521_mp_mod);
+    /* T2 = Y * Y */
+    sp_521_mont_sqr_avx2_9(t2, y, p521_mod, p521_mp_mod);
+    /* T2 = T2/2 */
+    sp_521_div2_avx2_9(t2, t2, p521_mod);
+    /* Y = Y * X */
+    sp_521_mont_mul_avx2_9(y, y, p->x, p521_mod, p521_mp_mod);
+    /* X = T1 * T1 */
+    sp_521_mont_sqr_avx2_9(x, t1, p521_mod, p521_mp_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_avx2_9(x, x, y, p521_mod);
+    /* X = X - Y */
+    sp_521_mont_sub_avx2_9(x, x, y, p521_mod);
+    /* Y = Y - X */
+    sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod);
+    /* Y = Y * T1 */
+    sp_521_mont_mul_avx2_9(y, y, t1, p521_mod, p521_mp_mod);
+    /* Y = Y - T2 */
+    sp_521_mont_sub_avx2_9(y, y, t2, p521_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_dbl_avx2_9_ctx {
     int state;
@@ -53905,6 +53863,12 @@ typedef struct sp_521_proj_point_dbl_avx2_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_dbl_avx2_9_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r, const sp_point_521* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -54029,62 +53993,6 @@ static int sp_521_proj_point_dbl_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_521_proj_point_dbl_avx2_9(sp_point_521* r, const sp_point_521* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_521_mont_sqr_avx2_9(t1, p->z, p521_mod, p521_mp_mod);
-    /* Z = Y * Z */
-    sp_521_mont_mul_avx2_9(z, p->y, p->z, p521_mod, p521_mp_mod);
-    /* Z = 2Z */
-    sp_521_mont_dbl_avx2_9(z, z, p521_mod);
-    /* T2 = X - T1 */
-    sp_521_mont_sub_avx2_9(t2, p->x, t1, p521_mod);
-    /* T1 = X + T1 */
-    sp_521_mont_add_avx2_9(t1, p->x, t1, p521_mod);
-    /* T2 = T1 * T2 */
-    sp_521_mont_mul_avx2_9(t2, t1, t2, p521_mod, p521_mp_mod);
-    /* T1 = 3T2 */
-    sp_521_mont_tpl_avx2_9(t1, t2, p521_mod);
-    /* Y = 2Y */
-    sp_521_mont_dbl_avx2_9(y, p->y, p521_mod);
-    /* Y = Y * Y */
-    sp_521_mont_sqr_avx2_9(y, y, p521_mod, p521_mp_mod);
-    /* T2 = Y * Y */
-    sp_521_mont_sqr_avx2_9(t2, y, p521_mod, p521_mp_mod);
-    /* T2 = T2/2 */
-    sp_521_div2_avx2_9(t2, t2, p521_mod);
-    /* Y = Y * X */
-    sp_521_mont_mul_avx2_9(y, y, p->x, p521_mod, p521_mp_mod);
-    /* X = T1 * T1 */
-    sp_521_mont_sqr_avx2_9(x, t1, p521_mod, p521_mp_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_avx2_9(x, x, y, p521_mod);
-    /* X = X - Y */
-    sp_521_mont_sub_avx2_9(x, x, y, p521_mod);
-    /* Y = Y - X */
-    sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod);
-    /* Y = Y * T1 */
-    sp_521_mont_mul_avx2_9(y, y, t1, p521_mod, p521_mp_mod);
-    /* Y = Y - T2 */
-    sp_521_mont_sub_avx2_9(y, y, t2, p521_mod);
-}
-
 #define sp_521_mont_dbl_lower_avx2_9 sp_521_mont_dbl_avx2_9
 #define sp_521_mont_tpl_lower_avx2_9 sp_521_mont_tpl_avx2_9
 /* Double the Montgomery form projective point p a number of times.
@@ -54116,7 +54024,6 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
     /* W = Z^4 */
     sp_521_mont_sqr_avx2_9(w, z, p521_mod, p521_mp_mod);
     sp_521_mont_sqr_avx2_9(w, w, p521_mod, p521_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -54134,7 +54041,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
         sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_avx2_9(t2, b, p521_mod);
         sp_521_mont_sub_avx2_9(x, x, t2, p521_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -54164,7 +54071,7 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
     sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod);
     sp_521_mont_dbl_avx2_9(t2, b, p521_mod);
     sp_521_mont_sub_avx2_9(x, x, t2, p521_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod);
     sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod);
     /* Z = Z*Y */
@@ -54174,11 +54081,12 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod);
     sp_521_mont_sub_avx2_9(y, y, t1, p521_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_521_div2_avx2_9(y, y, p521_mod);
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -54186,6 +54094,81 @@ static void sp_521_proj_point_dbl_n_avx2_9(sp_point_521* p, int i,
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_521_proj_point_add_avx2_9(sp_point_521* r,
+        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*9;
+    sp_digit* t3 = t + 4*9;
+    sp_digit* t4 = t + 6*9;
+    sp_digit* t5 = t + 8*9;
+    sp_digit* t6 = t + 10*9;
+
+    /* U1 = X1*Z2^2 */
+    sp_521_mont_sqr_avx2_9(t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t3, t1, q->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t1, t1, p->x, p521_mod, p521_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_521_mont_mul_avx2_9(t3, t3, p->y, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(t2, t1) &
+            sp_521_cmp_equal_9(t4, t3)) {
+        sp_521_proj_point_dbl_avx2_9(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_521_mont_sub_avx2_9(t2, t2, t1, p521_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_avx2_9(t4, t4, t3, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(y, t1, t5, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_avx2_9(z, p->z, t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(z, z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_avx2_9(x, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_avx2_9(x, x, t5, p521_mod);
+        sp_521_mont_mul_avx2_9(t5, t5, t3, p521_mod, p521_mp_mod);
+        sp_521_mont_dbl_avx2_9(t3, y, p521_mod);
+        sp_521_mont_sub_avx2_9(x, x, t3, p521_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod);
+        sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_avx2_9(y, y, t5, p521_mod);
+        for (i = 0; i < 9; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 9; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_521_proj_point_add_avx2_9_ctx {
@@ -54204,6 +54187,13 @@ typedef struct sp_521_proj_point_add_avx2_9_ctx {
     sp_digit* z;
 } sp_521_proj_point_add_avx2_9_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r,
     const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
@@ -54235,252 +54225,149 @@ static int sp_521_proj_point_add_avx2_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_521* r
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_521_sub_avx2_9(ctx->t1, p521_mod, q->y);
-        sp_521_norm_avx2_9(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_521_cmp_equal_avx2_9(p->x, q->x) & sp_521_cmp_equal_avx2_9(p->z, q->z) &
-            (sp_521_cmp_equal_avx2_9(p->y, q->y) | sp_521_cmp_equal_avx2_9(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_521_mont_sqr_avx2_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_521_proj_point_dbl_avx2_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_521_mont_mul_avx2_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_521_mont_mul_avx2_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_avx2_9(ctx->t1, q->z, p521_mod, p521_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_521_mont_sqr_avx2_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_521_mont_mul_avx2_9(ctx->t3, ctx->t1, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_521_mont_mul_avx2_9(ctx->t1, ctx->t1, p->x, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_avx2_9(ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_521_mont_mul_avx2_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_521_mont_mul_avx2_9(ctx->t4, ctx->t2, p->z, p521_mod, p521_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_521_mont_mul_avx2_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_521_mont_mul_avx2_9(ctx->t2, ctx->t2, q->x, p521_mod, p521_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_521_cmp_equal_9(ctx->t2, ctx->t1) &
+                sp_521_cmp_equal_9(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_521_proj_point_dbl_avx2_9(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_avx2_9(ctx->t3, ctx->t3, p->y, p521_mod, p521_mp_mod);
+        /* H = U2 - U1 */
+        sp_521_mont_sub_avx2_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_avx2_9(ctx->t4, ctx->t4, q->y, p521_mod, p521_mp_mod);
+        /* R = S2 - S1 */
+        sp_521_mont_sub_avx2_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_521_mont_sub_avx2_9(ctx->t2, ctx->t2, ctx->t1, p521_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_521_mont_sqr_avx2_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_521_mont_sub_avx2_9(ctx->t4, ctx->t4, ctx->t3, p521_mod);
+        sp_521_mont_mul_avx2_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_521_mont_sqr_avx2_9(ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_521_mont_mul_avx2_9(ctx->y, ctx->t1, ctx->t5, p521_mod, p521_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_521_mont_mul_avx2_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_521_mont_mul_avx2_9(ctx->z, p->z, ctx->t2, p521_mod, p521_mp_mod);
+        sp_521_mont_sqr_avx2_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_521_mont_mul_avx2_9(ctx->z, ctx->z, q->z, p521_mod, p521_mp_mod);
+        sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t5, p521_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_521_mont_sqr_avx2_9(ctx->x, ctx->t4, p521_mod, p521_mp_mod);
+        sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t5, p521_mod);
+        sp_521_mont_dbl_avx2_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t3, p521_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_521_mont_mul_avx2_9(ctx->t5, ctx->t5, ctx->t3, p521_mod, p521_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_521_mont_dbl_avx2_9(ctx->t3, ctx->y, p521_mod);
+        sp_521_mont_mul_avx2_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_521_mont_sub_avx2_9(ctx->x, ctx->x, ctx->t3, p521_mod);
+        sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->t5, p521_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_521_mont_sub_lower_avx2_9(ctx->y, ctx->y, ctx->x, p521_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_521_mont_mul_avx2_9(ctx->y, ctx->y, ctx->t4, p521_mod, p521_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_521_mont_sub_avx2_9(ctx->y, ctx->y, ctx->t5, p521_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 9; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 9; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_521_proj_point_add_avx2_9(sp_point_521* r,
-        const sp_point_521* p, const sp_point_521* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*9;
-    sp_digit* t3 = t + 4*9;
-    sp_digit* t4 = t + 6*9;
-    sp_digit* t5 = t + 8*9;
-    sp_digit* t6 = t + 10*9;
-
-
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
-        sp_521_proj_point_dbl_avx2_9(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_521_mont_sqr_avx2_9(t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t3, t1, q->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t1, t1, p->x, p521_mod, p521_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_521_mont_mul_avx2_9(t3, t3, p->y, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod);
-        /* H = U2 - U1 */
-        sp_521_mont_sub_avx2_9(t2, t2, t1, p521_mod);
-        /* R = S2 - S1 */
-        sp_521_mont_sub_avx2_9(t4, t4, t3, p521_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_521_iszero_9(t2) & sp_521_iszero_9(t4) & maskt) {
-            sp_521_proj_point_dbl_avx2_9(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_521_mont_sqr_avx2_9(t5, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_avx2_9(y, t1, t5, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_avx2_9(t5, t5, t2, p521_mod, p521_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_521_mont_mul_avx2_9(z, p->z, t2, p521_mod, p521_mp_mod);
-            sp_521_mont_mul_avx2_9(z, z, q->z, p521_mod, p521_mp_mod);
-            sp_521_mont_sqr_avx2_9(x, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_avx2_9(x, x, t5, p521_mod);
-            sp_521_mont_mul_avx2_9(t5, t5, t3, p521_mod, p521_mp_mod);
-            sp_521_mont_dbl_avx2_9(t3, y, p521_mod);
-            sp_521_mont_sub_avx2_9(x, x, t3, p521_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_521_mont_sub_lower_avx2_9(y, y, x, p521_mod);
-            sp_521_mont_mul_avx2_9(y, y, t4, p521_mod, p521_mp_mod);
-            sp_521_mont_sub_avx2_9(y, y, t5, p521_mod);
-
-            for (i = 0; i < 9; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 9; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -54533,7 +54420,7 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r,
         sp_521_mont_sqr_avx2_9(x, a, p521_mod, p521_mp_mod);
         sp_521_mont_dbl_avx2_9(t2, b, p521_mod);
         sp_521_mont_sub_avx2_9(x, x, t2, p521_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_521_mont_sub_lower_avx2_9(t2, b, x, p521_mod);
         sp_521_mont_dbl_lower_avx2_9(b, t2, p521_mod);
         /* Z = Z*Y */
@@ -54548,7 +54435,6 @@ static void sp_521_proj_point_dbl_n_store_avx2_9(sp_point_521* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_521_mont_mul_avx2_9(y, b, a, p521_mod, p521_mp_mod);
         sp_521_mont_sub_avx2_9(y, y, t1, p521_mod);
-
         /* Y = Y/2 */
         sp_521_div2_avx2_9(r[j].y, y, p521_mod);
         r[j].infinity = 0;
@@ -54797,8 +54683,8 @@ typedef struct sp_table_entry_521 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_9(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*9;
@@ -54807,12 +54693,17 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
     sp_digit* t5 = t + 8*9;
     sp_digit* t6 = t + 10*9;
 
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(p->x, t2) &
+            sp_521_cmp_equal_9(p->y, t4)) {
         sp_521_proj_point_dbl_9(r, p, t);
     }
     else {
@@ -54824,12 +54715,6 @@ static void sp_521_proj_point_add_qz1_9(sp_point_521* r, const sp_point_521* p,
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_9(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_9(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -55286,8 +55171,8 @@ static int sp_521_ecc_mulmod_9(sp_point_521* r, const sp_point_521* g, const sp_
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521* p,
-        const sp_point_521* q, sp_digit* t)
+static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r,
+    const sp_point_521* p, const sp_point_521* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*9;
@@ -55296,12 +55181,17 @@ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521
     sp_digit* t5 = t + 8*9;
     sp_digit* t6 = t + 10*9;
 
-    /* Check double */
-    (void)sp_521_sub_9(t1, p521_mod, q->y);
-    sp_521_norm_9(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_521_cmp_equal_9(p->x, q->x) & sp_521_cmp_equal_9(p->z, q->z) &
-        (sp_521_cmp_equal_9(p->y, q->y) | sp_521_cmp_equal_9(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod);
+    sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_521_cmp_equal_9(p->x, t2) &
+            sp_521_cmp_equal_9(p->y, t4)) {
         sp_521_proj_point_dbl_avx2_9(r, p, t);
     }
     else {
@@ -55313,12 +55203,6 @@ static void sp_521_proj_point_add_qz1_avx2_9(sp_point_521* r, const sp_point_521
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_521_mont_sqr_avx2_9(t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t4, t2, p->z, p521_mod, p521_mp_mod);
-        sp_521_mont_mul_avx2_9(t2, t2, q->x, p521_mod, p521_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_521_mont_mul_avx2_9(t4, t4, q->y, p521_mod, p521_mp_mod);
         /* H = U2 - X1 */
         sp_521_mont_sub_avx2_9(t2, t2, p->x, p521_mod);
         /* R = S2 - Y1 */
@@ -91800,7 +91684,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 (sp_digit)0 - (sp_digit)(c >= 0));
             sp_521_norm_9(r);
 
-            if (sp_521_iszero_9(r) == 0) {
+            if (!sp_521_iszero_9(r)) {
                 /* x is modified in calculation of s. */
                 sp_521_from_mp(x, 9, priv);
                 /* s ptr == e ptr, e is modified in calculation of s. */
@@ -91814,7 +91698,7 @@ int sp_ecc_sign_521(const byte* hash, word32 hashLen, WC_RNG* rng,
                 err = sp_521_calc_s_9(s, r, k, x, e, tmp);
 
                 /* Check that signature is usable. */
-                if ((err == MP_OKAY) && (sp_521_iszero_9(s) == 0)) {
+                if ((err == MP_OKAY) && (!sp_521_iszero_9(s))) {
                     break;
                 }
             }
@@ -93912,7 +93796,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_16(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 16, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_16(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_16(r->x, p1024_mod);
@@ -93921,7 +93805,7 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_16(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 16, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_16(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_16(r->y, p1024_mod);
@@ -93930,7 +93814,6 @@ static void sp_1024_map_16(sp_point_1024* r, const sp_point_1024* p,
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #ifdef __cplusplus
@@ -93975,6 +93858,61 @@ extern void sp_1024_div2_16(sp_digit* r, const sp_digit* a, const sp_digit* m);
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_16(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_16(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_16(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_16(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_16(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_16(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_16(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_16(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_16(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_16(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_16(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_16(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_16_ctx {
     int state;
@@ -93985,6 +93923,12 @@ typedef struct sp_1024_proj_point_dbl_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_16_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -94109,62 +94053,6 @@ static int sp_1024_proj_point_dbl_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_16(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_16(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_16(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_16(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_16(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_16(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_16(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_16(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_16(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_16(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_16(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_16(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_16(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_16(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_16(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_16(y, y, t2, p1024_mod);
-}
-
 #define sp_1024_mont_dbl_lower_16 sp_1024_mont_dbl_16
 #define sp_1024_mont_tpl_lower_16 sp_1024_mont_tpl_16
 /* Double the Montgomery form projective point p a number of times.
@@ -94196,7 +94084,6 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_16(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_16(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -94214,7 +94101,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
         sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_16(t2, b, p1024_mod);
         sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -94244,7 +94131,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_16(t2, b, p1024_mod);
     sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -94254,7 +94141,7 @@ static void sp_1024_proj_point_dbl_n_16(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_16(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_16(y, y, p1024_mod);
 }
@@ -94295,6 +94182,7 @@ static int sp_1024_iszero_16(const sp_digit* a)
             a[8] | a[9] | a[10] | a[11] | a[12] | a[13] | a[14] | a[15]) == 0;
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -94302,6 +94190,81 @@ static int sp_1024_iszero_16(const sp_digit* a)
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_16(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* t3 = t + 4*16;
+    sp_digit* t4 = t + 6*16;
+    sp_digit* t5 = t + 8*16;
+    sp_digit* t6 = t + 10*16;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(t2, t1) &
+            sp_1024_cmp_equal_16(t4, t3)) {
+        sp_1024_proj_point_dbl_16(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_16(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_16(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_16(t3, y, p1024_mod);
+        sp_1024_mont_sub_16(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
+        sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(y, y, t5, p1024_mod);
+        for (i = 0; i < 16; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_16_ctx {
@@ -94320,6 +94283,13 @@ typedef struct sp_1024_proj_point_add_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_16_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -94351,252 +94321,149 @@ static int sp_1024_proj_point_add_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_16(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_16(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-            (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_16_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_16(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_16(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_16(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_16(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 16; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_16(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* t3 = t + 4*16;
-    sp_digit* t4 = t + 6*16;
-    sp_digit* t5 = t + 8*16;
-    sp_digit* t6 = t + 10*16;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_16(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_16(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_16(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_16(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_16(t2) & sp_1024_iszero_16(t4) & maskt) {
-            sp_1024_proj_point_dbl_16(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_16(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_16(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_16(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_16(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_16(t3, y, p1024_mod);
-            sp_1024_mont_sub_16(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_16(y, y, x, p1024_mod);
-            sp_1024_mont_mul_16(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_16(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 16; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -94649,7 +94516,7 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         sp_1024_mont_sqr_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_16(t2, b, p1024_mod);
         sp_1024_mont_sub_16(x, x, t2, p1024_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -94664,7 +94531,6 @@ static void sp_1024_proj_point_dbl_n_store_16(sp_point_1024* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_1024_mont_mul_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_16(y, y, t1, p1024_mod);
-
         /* Y = Y/2 */
         sp_1024_div2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
@@ -95079,7 +94945,7 @@ static void sp_1024_map_avx2_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* x /= z^2 */
     sp_1024_mont_mul_avx2_16(r->x, p->x, t2, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->x + 16, 0, sizeof(r->x) / 2U);
+    XMEMSET(r->x + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_avx2_16(r->x, p1024_mod, p1024_mp_mod);
     /* Reduce x to less than modulus */
     n = sp_1024_cmp_16(r->x, p1024_mod);
@@ -95088,16 +94954,15 @@ static void sp_1024_map_avx2_16(sp_point_1024* r, const sp_point_1024* p,
 
     /* y /= z^3 */
     sp_1024_mont_mul_avx2_16(r->y, p->y, t1, p1024_mod, p1024_mp_mod);
-    XMEMSET(r->y + 16, 0, sizeof(r->y) / 2U);
+    XMEMSET(r->y + 16, 0, sizeof(sp_digit) * 16U);
     sp_1024_mont_reduce_avx2_16(r->y, p1024_mod, p1024_mp_mod);
     /* Reduce y to less than modulus */
     n = sp_1024_cmp_16(r->y, p1024_mod);
-    sp_1024_cond_sub_avx2_16(r->y, r->y, p1024_mod, ~(n >> 63));
+    sp_1024_cond_sub_16(r->y, r->y, p1024_mod, ~(n >> 63));
     sp_1024_norm_16(r->y);
 
     XMEMSET(r->z, 0, sizeof(r->z) / 2);
     r->z[0] = 1;
-
 }
 
 #ifdef __cplusplus
@@ -95142,6 +95007,61 @@ extern void sp_1024_div2_avx2_16(sp_digit* r, const sp_digit* a, const sp_digit*
  * p  Point to double.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024* p,
+    sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* x;
+    sp_digit* y;
+    sp_digit* z;
+
+    x = r->x;
+    y = r->y;
+    z = r->z;
+    /* Put infinity into result. */
+    if (r != p) {
+        r->infinity = p->infinity;
+    }
+
+    /* T1 = Z * Z */
+    sp_1024_mont_sqr_avx2_16(t1, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = Y * Z */
+    sp_1024_mont_mul_avx2_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
+    /* Z = 2Z */
+    sp_1024_mont_dbl_avx2_16(z, z, p1024_mod);
+    /* T2 = X - T1 */
+    sp_1024_mont_sub_avx2_16(t2, p->x, t1, p1024_mod);
+    /* T1 = X + T1 */
+    sp_1024_mont_add_avx2_16(t1, p->x, t1, p1024_mod);
+    /* T2 = T1 * T2 */
+    sp_1024_mont_mul_avx2_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
+    /* T1 = 3T2 */
+    sp_1024_mont_tpl_avx2_16(t1, t2, p1024_mod);
+    /* Y = 2Y */
+    sp_1024_mont_dbl_avx2_16(y, p->y, p1024_mod);
+    /* Y = Y * Y */
+    sp_1024_mont_sqr_avx2_16(y, y, p1024_mod, p1024_mp_mod);
+    /* T2 = Y * Y */
+    sp_1024_mont_sqr_avx2_16(t2, y, p1024_mod, p1024_mp_mod);
+    /* T2 = T2/2 */
+    sp_1024_div2_avx2_16(t2, t2, p1024_mod);
+    /* Y = Y * X */
+    sp_1024_mont_mul_avx2_16(y, y, p->x, p1024_mod, p1024_mp_mod);
+    /* X = T1 * T1 */
+    sp_1024_mont_sqr_avx2_16(x, t1, p1024_mod, p1024_mp_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod);
+    /* X = X - Y */
+    sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod);
+    /* Y = Y - X */
+    sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod);
+    /* Y = Y * T1 */
+    sp_1024_mont_mul_avx2_16(y, y, t1, p1024_mod, p1024_mp_mod);
+    /* Y = Y - T2 */
+    sp_1024_mont_sub_avx2_16(y, y, t2, p1024_mod);
+}
+
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_dbl_avx2_16_ctx {
     int state;
@@ -95152,6 +95072,12 @@ typedef struct sp_1024_proj_point_dbl_avx2_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_dbl_avx2_16_ctx;
 
+/* Double the Montgomery form projective point p.
+ *
+ * r  Result of doubling point.
+ * p  Point to double.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r, const sp_point_1024* p, sp_digit* t)
 {
     int err = FP_WOULDBLOCK;
@@ -95276,62 +95202,6 @@ static int sp_1024_proj_point_dbl_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
-
-static void sp_1024_proj_point_dbl_avx2_16(sp_point_1024* r, const sp_point_1024* p,
-    sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* x;
-    sp_digit* y;
-    sp_digit* z;
-
-    x = r->x;
-    y = r->y;
-    z = r->z;
-    /* Put infinity into result. */
-    if (r != p) {
-        r->infinity = p->infinity;
-    }
-
-    /* T1 = Z * Z */
-    sp_1024_mont_sqr_avx2_16(t1, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = Y * Z */
-    sp_1024_mont_mul_avx2_16(z, p->y, p->z, p1024_mod, p1024_mp_mod);
-    /* Z = 2Z */
-    sp_1024_mont_dbl_avx2_16(z, z, p1024_mod);
-    /* T2 = X - T1 */
-    sp_1024_mont_sub_avx2_16(t2, p->x, t1, p1024_mod);
-    /* T1 = X + T1 */
-    sp_1024_mont_add_avx2_16(t1, p->x, t1, p1024_mod);
-    /* T2 = T1 * T2 */
-    sp_1024_mont_mul_avx2_16(t2, t1, t2, p1024_mod, p1024_mp_mod);
-    /* T1 = 3T2 */
-    sp_1024_mont_tpl_avx2_16(t1, t2, p1024_mod);
-    /* Y = 2Y */
-    sp_1024_mont_dbl_avx2_16(y, p->y, p1024_mod);
-    /* Y = Y * Y */
-    sp_1024_mont_sqr_avx2_16(y, y, p1024_mod, p1024_mp_mod);
-    /* T2 = Y * Y */
-    sp_1024_mont_sqr_avx2_16(t2, y, p1024_mod, p1024_mp_mod);
-    /* T2 = T2/2 */
-    sp_1024_div2_avx2_16(t2, t2, p1024_mod);
-    /* Y = Y * X */
-    sp_1024_mont_mul_avx2_16(y, y, p->x, p1024_mod, p1024_mp_mod);
-    /* X = T1 * T1 */
-    sp_1024_mont_sqr_avx2_16(x, t1, p1024_mod, p1024_mp_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod);
-    /* X = X - Y */
-    sp_1024_mont_sub_avx2_16(x, x, y, p1024_mod);
-    /* Y = Y - X */
-    sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod);
-    /* Y = Y * T1 */
-    sp_1024_mont_mul_avx2_16(y, y, t1, p1024_mod, p1024_mp_mod);
-    /* Y = Y - T2 */
-    sp_1024_mont_sub_avx2_16(y, y, t2, p1024_mod);
-}
-
 #define sp_1024_mont_dbl_lower_avx2_16 sp_1024_mont_dbl_avx2_16
 #define sp_1024_mont_tpl_lower_avx2_16 sp_1024_mont_tpl_avx2_16
 /* Double the Montgomery form projective point p a number of times.
@@ -95363,7 +95233,6 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
     /* W = Z^4 */
     sp_1024_mont_sqr_avx2_16(w, z, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sqr_avx2_16(w, w, p1024_mod, p1024_mp_mod);
-
 #ifndef WOLFSSL_SP_SMALL
     while (--n > 0)
 #else
@@ -95381,7 +95250,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
         sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod);
         sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod);
-        /*   b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -95411,7 +95280,7 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
     sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod);
     sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod);
-    /*   b = 2.(B - X) */
+    /* B = 2.(B - X) */
     sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod);
     sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod);
     /* Z = Z*Y */
@@ -95421,11 +95290,12 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
     /* y = 2*A*(B - X) - Y^4 */
     sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod);
     sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod);
-#endif
+#endif /* WOLFSSL_SP_SMALL */
     /* Y = Y/2 */
     sp_1024_div2_avx2_16(y, y, p1024_mod);
 }
 
+
 /* Add two Montgomery form projective points.
  *
  * r  Result of addition.
@@ -95433,6 +95303,81 @@ static void sp_1024_proj_point_dbl_n_avx2_16(sp_point_1024* p, int i,
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
+static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r,
+        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
+{
+    sp_digit* t1 = t;
+    sp_digit* t2 = t + 2*16;
+    sp_digit* t3 = t + 4*16;
+    sp_digit* t4 = t + 6*16;
+    sp_digit* t5 = t + 8*16;
+    sp_digit* t6 = t + 10*16;
+
+    /* U1 = X1*Z2^2 */
+    sp_1024_mont_sqr_avx2_16(t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S1 = Y1*Z2^3 */
+    sp_1024_mont_mul_avx2_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    /* Check double */
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(t2, t1) &
+            sp_1024_cmp_equal_16(t4, t3)) {
+        sp_1024_proj_point_dbl_avx2_16(r, p, t);
+    }
+    else {
+        sp_digit maskp;
+        sp_digit maskq;
+        sp_digit maskt;
+        sp_digit* x = t6;
+        sp_digit* y = t1;
+        sp_digit* z = t2;
+        int i;
+
+        maskp = 0 - (q->infinity & (!p->infinity));
+        maskq = 0 - (p->infinity & (!q->infinity));
+        maskt = ~(maskp | maskq);
+
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_avx2_16(t2, t2, t1, p1024_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_avx2_16(t4, t4, t3, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(y, t1, t5, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_avx2_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(z, z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_avx2_16(x, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_avx2_16(x, x, t5, p1024_mod);
+        sp_1024_mont_mul_avx2_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_dbl_avx2_16(t3, y, p1024_mod);
+        sp_1024_mont_sub_avx2_16(x, x, t3, p1024_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod);
+        sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_avx2_16(y, y, t5, p1024_mod);
+        for (i = 0; i < 16; i++) {
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (x[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (y[i] & maskt);
+        }
+        for (i = 0; i < 16; i++) {
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (z[i] & maskt);
+        }
+        r->z[0] |= p->infinity & q->infinity;
+        r->infinity = p->infinity & q->infinity;
+    }
+}
 
 #ifdef WOLFSSL_SP_NONBLOCK
 typedef struct sp_1024_proj_point_add_avx2_16_ctx {
@@ -95451,6 +95396,13 @@ typedef struct sp_1024_proj_point_add_avx2_16_ctx {
     sp_digit* z;
 } sp_1024_proj_point_add_avx2_16_ctx;
 
+/* Add two Montgomery form projective points.
+ *
+ * r  Result of addition.
+ * p  First point to add.
+ * q  Second point to add.
+ * t  Temporary ordinate data.
+ */
 static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024* r,
     const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
@@ -95482,252 +95434,149 @@ static int sp_1024_proj_point_add_avx2_16_nb(sp_ecc_ctx_t* sp_ctx, sp_point_1024
         ctx->state = 1;
         break;
     case 1:
-        /* Check double */
-        (void)sp_1024_sub_avx2_16(ctx->t1, p1024_mod, q->y);
-        sp_1024_norm_avx2_16(ctx->t1);
-        if ((~p->infinity & ~q->infinity &
-            sp_1024_cmp_equal_avx2_16(p->x, q->x) & sp_1024_cmp_equal_avx2_16(p->z, q->z) &
-            (sp_1024_cmp_equal_avx2_16(p->y, q->y) | sp_1024_cmp_equal_avx2_16(p->y, ctx->t1))) != 0)
-        {
-            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
-            ctx->state = 2;
-        }
-        else {
-            ctx->state = 3;
-        }
+        /* U1 = X1*Z2^2 */
+        sp_1024_mont_sqr_avx2_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 2;
         break;
     case 2:
-        err = sp_1024_proj_point_dbl_avx2_16_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, r, p, t);
-        if (err == MP_OKAY)
-            ctx->state = 27; /* done */
+        sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        ctx->state = 3;
         break;
     case 3:
-    {
+        sp_1024_mont_mul_avx2_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
         ctx->state = 4;
         break;
-    }
     case 4:
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_avx2_16(ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        /* U2 = X2*Z1^2 */
+        sp_1024_mont_sqr_avx2_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 5;
         break;
     case 5:
-        sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t1, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
         ctx->state = 6;
         break;
     case 6:
-        sp_1024_mont_mul_avx2_16(ctx->t1, ctx->t1, p->x, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
         ctx->state = 7;
         break;
     case 7:
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_avx2_16(ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S1 = Y1*Z2^3 */
+        sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
         ctx->state = 8;
         break;
     case 8:
-        sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t2, p->z, p1024_mod, p1024_mp_mod);
+        /* S2 = Y2*Z1^3 */
+        sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
         ctx->state = 9;
         break;
     case 9:
-        sp_1024_mont_mul_avx2_16(ctx->t2, ctx->t2, q->x, p1024_mod, p1024_mp_mod);
-        ctx->state = 10;
+        /* Check double */
+        if ((~p->infinity) & (~q->infinity) &
+                sp_1024_cmp_equal_16(ctx->t2, ctx->t1) &
+                sp_1024_cmp_equal_16(ctx->t4, ctx->t3)) {
+            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
+            sp_1024_proj_point_dbl_avx2_16(r, p, t);
+            ctx->state = 25;
+        }
+        else {
+            ctx->state = 10;
+        }
         break;
     case 10:
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_avx2_16(ctx->t3, ctx->t3, p->y, p1024_mod, p1024_mp_mod);
+        /* H = U2 - U1 */
+        sp_1024_mont_sub_avx2_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
         ctx->state = 11;
         break;
     case 11:
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_avx2_16(ctx->t4, ctx->t4, q->y, p1024_mod, p1024_mp_mod);
+        /* R = S2 - S1 */
+        sp_1024_mont_sub_avx2_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
         ctx->state = 12;
         break;
     case 12:
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_avx2_16(ctx->t2, ctx->t2, ctx->t1, p1024_mod);
+        /* X3 = R^2 - H^3 - 2*U1*H^2 */
+        sp_1024_mont_sqr_avx2_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 13;
         break;
     case 13:
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_avx2_16(ctx->t4, ctx->t4, ctx->t3, p1024_mod);
+        sp_1024_mont_mul_avx2_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
         ctx->state = 14;
         break;
     case 14:
-        /* X3 = R^2 - H^3 - 2*U1*H^2 */
-        sp_1024_mont_sqr_avx2_16(ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 15;
         break;
     case 15:
-        sp_1024_mont_mul_avx2_16(ctx->y, ctx->t1, ctx->t5, p1024_mod, p1024_mp_mod);
+        /* Z3 = H*Z1*Z2 */
+        sp_1024_mont_mul_avx2_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
         ctx->state = 16;
         break;
     case 16:
-        sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
         ctx->state = 17;
         break;
     case 17:
-        /* Z3 = H*Z1*Z2 */
-        sp_1024_mont_mul_avx2_16(ctx->z, p->z, ctx->t2, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sqr_avx2_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 18;
         break;
     case 18:
-        sp_1024_mont_mul_avx2_16(ctx->z, ctx->z, q->z, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
         ctx->state = 19;
         break;
     case 19:
-        sp_1024_mont_sqr_avx2_16(ctx->x, ctx->t4, p1024_mod, p1024_mp_mod);
+        sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
         ctx->state = 20;
         break;
     case 20:
-        sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t5, p1024_mod);
+        sp_1024_mont_dbl_avx2_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
         ctx->state = 21;
         break;
     case 21:
-        sp_1024_mont_mul_avx2_16(ctx->t5, ctx->t5, ctx->t3, p1024_mod, p1024_mp_mod);
+        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
+        sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod);
         ctx->state = 22;
         break;
     case 22:
-        sp_1024_mont_dbl_avx2_16(ctx->t3, ctx->y, p1024_mod);
+        sp_1024_mont_mul_avx2_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
         ctx->state = 23;
         break;
     case 23:
-        sp_1024_mont_sub_avx2_16(ctx->x, ctx->x, ctx->t3, p1024_mod);
+        sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
         ctx->state = 24;
         break;
     case 24:
-        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-        sp_1024_mont_sub_lower_avx2_16(ctx->y, ctx->y, ctx->x, p1024_mod);
-        ctx->state = 25;
-        break;
-    case 25:
-        sp_1024_mont_mul_avx2_16(ctx->y, ctx->y, ctx->t4, p1024_mod, p1024_mp_mod);
-        ctx->state = 26;
-        break;
-    case 26:
-        sp_1024_mont_sub_avx2_16(ctx->y, ctx->y, ctx->t5, p1024_mod);
-        ctx->state = 27;
-        /* fall-through */
-    case 27:
     {
         int i;
         sp_digit maskp = 0 - (q->infinity & (!p->infinity));
         sp_digit maskq = 0 - (p->infinity & (!q->infinity));
         sp_digit maskt = ~(maskp | maskq);
+
         for (i = 0; i < 16; i++) {
-            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                      (ctx->x[i] & maskt);
+            r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) | (ctx->x[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                      (ctx->y[i] & maskt);
+            r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) | (ctx->y[i] & maskt);
         }
         for (i = 0; i < 16; i++) {
-            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                      (ctx->z[i] & maskt);
+            r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) | (ctx->z[i] & maskt);
         }
         r->z[0] |= p->infinity & q->infinity;
         r->infinity = p->infinity & q->infinity;
-
-        err = MP_OKAY;
+        ctx->state = 25;
         break;
     }
+    case 25:
+        err = MP_OKAY;
+        break;
     }
 
-    if (err == MP_OKAY && ctx->state != 27) {
+    if (err == MP_OKAY && ctx->state != 25) {
         err = FP_WOULDBLOCK;
     }
     return err;
 }
 #endif /* WOLFSSL_SP_NONBLOCK */
 
-static void sp_1024_proj_point_add_avx2_16(sp_point_1024* r,
-        const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
-{
-    sp_digit* t1 = t;
-    sp_digit* t2 = t + 2*16;
-    sp_digit* t3 = t + 4*16;
-    sp_digit* t4 = t + 6*16;
-    sp_digit* t5 = t + 8*16;
-    sp_digit* t6 = t + 10*16;
-
-
-    /* Check double */
-    (void)sp_1024_mont_sub_avx2_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
-        sp_1024_proj_point_dbl_avx2_16(r, p, t);
-    }
-    else {
-        sp_digit maskp;
-        sp_digit maskq;
-        sp_digit maskt;
-        sp_digit* x = t6;
-        sp_digit* y = t1;
-        sp_digit* z = t2;
-        int i;
-
-        maskp = 0 - (q->infinity & (!p->infinity));
-        maskq = 0 - (p->infinity & (!q->infinity));
-        maskt = ~(maskp | maskq);
-
-        /* U1 = X1*Z2^2 */
-        sp_1024_mont_sqr_avx2_16(t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t3, t1, q->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t1, t1, p->x, p1024_mod, p1024_mp_mod);
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S1 = Y1*Z2^3 */
-        sp_1024_mont_mul_avx2_16(t3, t3, p->y, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
-        /* H = U2 - U1 */
-        sp_1024_mont_sub_avx2_16(t2, t2, t1, p1024_mod);
-        /* R = S2 - S1 */
-        sp_1024_mont_sub_avx2_16(t4, t4, t3, p1024_mod);
-        if (~p->infinity & ~q->infinity &
-            sp_1024_iszero_16(t2) & sp_1024_iszero_16(t4) & maskt) {
-            sp_1024_proj_point_dbl_avx2_16(r, p, t);
-        }
-        else {
-            /* X3 = R^2 - H^3 - 2*U1*H^2 */
-            sp_1024_mont_sqr_avx2_16(t5, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_avx2_16(y, t1, t5, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_avx2_16(t5, t5, t2, p1024_mod, p1024_mp_mod);
-            /* Z3 = H*Z1*Z2 */
-            sp_1024_mont_mul_avx2_16(z, p->z, t2, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_mul_avx2_16(z, z, q->z, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sqr_avx2_16(x, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_avx2_16(x, x, t5, p1024_mod);
-            sp_1024_mont_mul_avx2_16(t5, t5, t3, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_dbl_avx2_16(t3, y, p1024_mod);
-            sp_1024_mont_sub_avx2_16(x, x, t3, p1024_mod);
-            /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-            sp_1024_mont_sub_lower_avx2_16(y, y, x, p1024_mod);
-            sp_1024_mont_mul_avx2_16(y, y, t4, p1024_mod, p1024_mp_mod);
-            sp_1024_mont_sub_avx2_16(y, y, t5, p1024_mod);
-
-            for (i = 0; i < 16; i++) {
-                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
-                          (x[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
-                          (y[i] & maskt);
-            }
-            for (i = 0; i < 16; i++) {
-                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
-                          (z[i] & maskt);
-            }
-            r->z[0] |= p->infinity & q->infinity;
-            r->infinity = p->infinity & q->infinity;
-        }
-    }
-}
-
 /* Double the Montgomery form projective point p a number of times.
  *
  * r  Result of repeated doubling of point.
@@ -95780,7 +95629,7 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r,
         sp_1024_mont_sqr_avx2_16(x, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_dbl_avx2_16(t2, b, p1024_mod);
         sp_1024_mont_sub_avx2_16(x, x, t2, p1024_mod);
-        /*  b = 2.(B - X) */
+        /* B = 2.(B - X) */
         sp_1024_mont_sub_lower_avx2_16(t2, b, x, p1024_mod);
         sp_1024_mont_dbl_lower_avx2_16(b, t2, p1024_mod);
         /* Z = Z*Y */
@@ -95795,7 +95644,6 @@ static void sp_1024_proj_point_dbl_n_store_avx2_16(sp_point_1024* r,
         /* y = 2*A*(B - X) - Y^4 */
         sp_1024_mont_mul_avx2_16(y, b, a, p1024_mod, p1024_mp_mod);
         sp_1024_mont_sub_avx2_16(y, y, t1, p1024_mod);
-
         /* Y = Y/2 */
         sp_1024_div2_avx2_16(r[j].y, y, p1024_mod);
         r[j].infinity = 0;
@@ -96048,8 +95896,8 @@ typedef struct sp_table_entry_1024 {
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*16;
@@ -96058,12 +95906,17 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024*
     sp_digit* t5 = t + 8*16;
     sp_digit* t6 = t + 10*16;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(p->x, t2) &
+            sp_1024_cmp_equal_16(p->y, t4)) {
         sp_1024_proj_point_dbl_16(r, p, t);
     }
     else {
@@ -96075,12 +95928,6 @@ static void sp_1024_proj_point_add_qz1_16(sp_point_1024* r, const sp_point_1024*
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_16(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */
@@ -96506,8 +96353,8 @@ static int sp_1024_ecc_mulmod_16(sp_point_1024* r, const sp_point_1024* g, const
  * q  Second point to add.
  * t  Temporary ordinate data.
  */
-static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_1024* p,
-        const sp_point_1024* q, sp_digit* t)
+static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r,
+    const sp_point_1024* p, const sp_point_1024* q, sp_digit* t)
 {
     sp_digit* t1 = t;
     sp_digit* t2 = t + 2*16;
@@ -96516,12 +96363,17 @@ static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_
     sp_digit* t5 = t + 8*16;
     sp_digit* t6 = t + 10*16;
 
-    /* Check double */
-    (void)sp_1024_mont_sub_avx2_16(t1, p1024_mod, q->y, p1024_mod);
-    sp_1024_norm_16(t1);
-    if ((~p->infinity & ~q->infinity &
-        sp_1024_cmp_equal_16(p->x, q->x) & sp_1024_cmp_equal_16(p->z, q->z) &
-        (sp_1024_cmp_equal_16(p->y, q->y) | sp_1024_cmp_equal_16(p->y, t1))) != 0) {
+    /* Calculate values to subtract from P->x and P->y. */
+    /* U2 = X2*Z1^2 */
+    sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
+    sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
+    /* S2 = Y2*Z1^3 */
+    sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
+
+    if ((~p->infinity) & (~q->infinity) &
+            sp_1024_cmp_equal_16(p->x, t2) &
+            sp_1024_cmp_equal_16(p->y, t4)) {
         sp_1024_proj_point_dbl_avx2_16(r, p, t);
     }
     else {
@@ -96533,12 +96385,6 @@ static void sp_1024_proj_point_add_qz1_avx2_16(sp_point_1024* r, const sp_point_
         sp_digit* z = t6;
         int i;
 
-        /* U2 = X2*Z1^2 */
-        sp_1024_mont_sqr_avx2_16(t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t4, t2, p->z, p1024_mod, p1024_mp_mod);
-        sp_1024_mont_mul_avx2_16(t2, t2, q->x, p1024_mod, p1024_mp_mod);
-        /* S2 = Y2*Z1^3 */
-        sp_1024_mont_mul_avx2_16(t4, t4, q->y, p1024_mod, p1024_mp_mod);
         /* H = U2 - X1 */
         sp_1024_mont_sub_avx2_16(t2, t2, p->x, p1024_mod);
         /* R = S2 - Y1 */