From 963dfc1acb10c406888433329570b0b8fae4d2e4 Mon Sep 17 00:00:00 2001
From: Pi Lanningham <pi.lanningham@gmail.com>
Date: Wed, 10 Jan 2024 18:04:18 -0500
Subject: [PATCH 1/3] Resolve SSW-303

Replaces our recursive implementation of do_2_exp with the heavily
optimized math.pow2 from the Aiken standard library.

This version of the module didn't exist when do_2_exp was written, but
it uses a really clever trick to optimize. For the first 8 powers of 2
(2^0 through 2^7), it looks the result up in a bytestring, and only
recurses, in big leaps, for 2^8 and above.

In our benchmarks, this brings the number of orders per batch from 32 to
35!!
---
 lib/calculation/shared.ak | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/lib/calculation/shared.ak b/lib/calculation/shared.ak
index dcc7abc..73cd25c 100644
--- a/lib/calculation/shared.ak
+++ b/lib/calculation/shared.ak
@@ -1,6 +1,7 @@
 //// Shared types and functions across all pool calculations
 
 use aiken/builtin
+use aiken/math
 use shared.{SingletonValue}
 
 /// An interim pool state
@@ -65,7 +66,7 @@ fn unsafe_fast_index_with_tail(inputs: List<a>, idx: Int) -> List<a> {
 
 pub fn check_and_set_unique(uniqueness_flags: Int, index: Int) -> Int {
   expect index >= 0
-  let bit = do_2_exp(index)
+  let bit = math.pow2(index)
   let bit_shifted = 2 * bit
 
   let flag_set = uniqueness_flags + bit
@@ -73,11 +74,3 @@ pub fn check_and_set_unique(uniqueness_flags: Int, index: Int) -> Int {
   expect flag_set % bit_shifted > uniqueness_flags % bit_shifted
   flag_set
 }
-
-pub fn do_2_exp(n: Int) -> Int {
-  if n <= 0 {
-    1
-  } else {
-    2 * do_2_exp(n - 1)
-  }
-}

From dd08e96ac227a6b7b4e41151d5e40f93fcbab572 Mon Sep 17 00:00:00 2001
From: Pi Lanningham <pi.lanningham@gmail.com>
Date: Mon, 15 Jan 2024 16:00:03 -0500
Subject: [PATCH 2/3] Use an even more hand-optimized version of pow2

TxPipe did some benchmarking and found that unrolling the loop for our typical order sizes helps even more.
---
 lib/calculation/shared.ak | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/lib/calculation/shared.ak b/lib/calculation/shared.ak
index 73cd25c..943759e 100644
--- a/lib/calculation/shared.ak
+++ b/lib/calculation/shared.ak
@@ -66,7 +66,7 @@ fn unsafe_fast_index_with_tail(inputs: List<a>, idx: Int) -> List<a> {
 
 pub fn check_and_set_unique(uniqueness_flags: Int, index: Int) -> Int {
   expect index >= 0
-  let bit = math.pow2(index)
+  let bit = small_pow2(index)
   let bit_shifted = 2 * bit
 
   let flag_set = uniqueness_flags + bit
@@ -74,3 +74,32 @@ pub fn check_and_set_unique(uniqueness_flags: Int, index: Int) -> Int {
   expect flag_set % bit_shifted > uniqueness_flags % bit_shifted
   flag_set
 }
+
+/// This is a version of pow2 that's optimized for small batch sizes
+/// It performs a few more granular loop-unrolls, converging on the small lookup index faster
+/// This was presented by TxPipe, and squeezes out one extra escrow over math.pow2 for our typical order sizes
+pub fn small_pow2(index)  -> Int {
+  let a = #[1, 2, 4, 8, 16, 32, 64, 128]
+  if e < 8 {
+    builtin.index_bytearray(a, e)
+  } else if e < 16 {
+    // 2^8 * recurse
+    256 * builtin.index_bytearray(a, e - 8)
+  } else if e < 24 {
+    // 2^16 * recurse
+    65536 * builtin.index_bytearray(a, e - 16)
+  } else if e < 32 {
+    // 2^24 * recurse
+    16777216 * builtin.index_bytearray(a, e - 24)
+  } else if e < 40 {
+    // 2^32 * recurse
+    4294967296 * builtin.index_bytearray(a, e - 32)
+  } else {
+    // Otherwise we can fall back to the built in;
+    // currently we can't fit more than 40 orders in a batch, but if
+    // the protocol parameters get bumped, we don't want things to start failing!
+    // When benchmarking, falling back to the builtin proved to be faster than recursing
+    // unsure why that is!
+    math.pow2(e)
+  }
+}
\ No newline at end of file

From 81c2d68af561c18f4e5e1f4ba3a00650776f1306 Mon Sep 17 00:00:00 2001
From: Pi Lanningham <pi.lanningham@gmail.com>
Date: Mon, 15 Jan 2024 16:06:02 -0500
Subject: [PATCH 3/3] Fix variable names

That's what I get for not running the tests lol
---
 lib/calculation/shared.ak | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/lib/calculation/shared.ak b/lib/calculation/shared.ak
index 943759e..3e95316 100644
--- a/lib/calculation/shared.ak
+++ b/lib/calculation/shared.ak
@@ -78,28 +78,28 @@ pub fn check_and_set_unique(uniqueness_flags: Int, index: Int) -> Int {
 /// This is a version of pow2 that's optimized for small batch sizes
 /// It performs a few more granular loop-unrolls, converging on the small lookup index faster
 /// This was presented by TxPipe, and squeezes out one extra escrow over math.pow2 for our typical order sizes
-pub fn small_pow2(index)  -> Int {
-  let a = #[1, 2, 4, 8, 16, 32, 64, 128]
-  if e < 8 {
-    builtin.index_bytearray(a, e)
-  } else if e < 16 {
-    // 2^8 * recurse
-    256 * builtin.index_bytearray(a, e - 8)
-  } else if e < 24 {
-    // 2^16 * recurse
-    65536 * builtin.index_bytearray(a, e - 16)
-  } else if e < 32 {
+pub fn small_pow2(exponent: Int)  -> Int {
+  let single_byte_powers = #[1, 2, 4, 8, 16, 32, 64, 128]
+  if exponent < 8 {
+    builtin.index_bytearray(single_byte_powers, exponent)
+  } else if exponent < 16 {
+    // 2^8 * table lookup
+    256 * builtin.index_bytearray(single_byte_powers, exponent - 8)
+  } else if exponent < 24 {
+    // 2^16 * table lookup
+    65536 * builtin.index_bytearray(single_byte_powers, exponent - 16)
+  } else if exponent < 32 {
     // 2^24 * recurse
-    16777216 * builtin.index_bytearray(a, e - 24)
-  } else if e < 40 {
+    16777216 * builtin.index_bytearray(single_byte_powers, exponent - 24)
+  } else if exponent < 40 {
     // 2^32 * recurse
-    4294967296 * builtin.index_bytearray(a, e - 32)
+    4294967296 * builtin.index_bytearray(single_byte_powers, exponent - 32)
   } else {
     // Otherwise we can fall back to the built in;
     // currently we can't fit more than 40 orders in a batch, but if
     // the protocol parameters get bumped, we don't want things to start failing!
     // When benchmarking, falling back to the builtin proved to be faster than recursing
     // unsure why that is!
-    math.pow2(e)
+    math.pow2(exponent)
   }
 }
\ No newline at end of file