From 2771fedcb65e330c16bcc422cf19492eecc6cc3e Mon Sep 17 00:00:00 2001
From: Benoit Chevallier-Mames <benoit.chevalliermames@zama.ai>
Date: Mon, 1 Jul 2024 12:00:19 +0200
Subject: [PATCH] docs(frontend): adding a --distance option

---
 .../levenshtein_distance.md                   | 203 ++++++++++++++++++
 .../levenshtein_distance.py                   |  93 +++++---
 2 files changed, 270 insertions(+), 26 deletions(-)
 create mode 100644 frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md

diff --git a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md
new file mode 100644
index 0000000000..9085cc5967
--- /dev/null
+++ b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md
@@ -0,0 +1,203 @@
+# Computing the Levenshtein distance in FHE
+
+## Levenshtein distance
+
+The Levenshtein distance is a classical metric for comparing two strings. Let us write strings `a`
+and `b` as vectors of characters, so that `a[0]` is the first character of `a` and `a[1:]` is the
+rest of the string. The Levenshtein distance is then defined as:
+
+    Levenshtein(a, b) :=
+        length(a) if length(b) == 0, or
+        length(b) if length(a) == 0, or
+        Levenshtein(a[1:], b[1:]) if a[0] == b[0], or
+        1 + min(Levenshtein(a[1:], b), Levenshtein(a, b[1:]), Levenshtein(a[1:], b[1:]))
+
+More information can be found for example on the [Wikipedia page](https://en.wikipedia.org/wiki/Levenshtein_distance).
+
+## Computing the distance in FHE
+
+Computing this distance over encrypted data can be useful, for example in the banking sector.
+[Our code](levenshtein_distance.py) shows how to do this simply, using our FHE modules.
+
+Available options are:
+
+```
+
+usage: levenshtein_distance.py [-h] [--show_mlir] [--show_optimizer] [--autotest] [--autoperf] [--distance DISTANCE DISTANCE]
+                               [--alphabet {string,STRING,StRiNg,ACTG}] [--max_string_length MAX_STRING_LENGTH]
+
+Levenshtein distance in Concrete.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --show_mlir           Show the MLIR
+  --show_optimizer      Show the optimizer outputs
+  --autotest            Run random tests
+  --autoperf            Run benchmarks
+  --distance DISTANCE DISTANCE
+                        Compute a distance
+  --alphabet {string,STRING,StRiNg,ACTG}
+                        Setting the alphabet
+  --max_string_length MAX_STRING_LENGTH
+                        Setting the maximal size of strings
+```
+
+The available alphabets are:
+- `string`: lowercase letters, i.e., `[a-z]*`
+- `STRING`: uppercase letters, i.e., `[A-Z]*`
+- `StRiNg`: lowercase and uppercase letters, i.e., `[a-zA-Z]*`
+- `ACTG`: `[ACTG]*`, for DNA analysis
+
+It is very easy to add a new alphabet in the code.
+
+The most important usages are:
+
+- `levenshtein_distance.py --distance Zama amazing --alphabet StRiNg`: Compute the distance between
+the strings "Zama" and "amazing", using the characters of the `StRiNg` alphabet.
+
+```
+Running distance between strings aa and ab for alphabet string:
+
+    Computing Levenshtein between strings 'aa' and 'ab' - distance is 1, computed in 4.13 seconds
+
+Successful end
+```
+
+FIXME: the output above is from `--distance aa ab --alphabet string`, not from the command shown; re-run when the semantic bug is found
+
+- `levenshtein_distance.py --autotest`: Run random tests with the selected alphabet.
+
+```
+Making random tests with alphabet string
+Letters are abcdefghijklmnopqrstuvwxyz
+
+Computations in simulation
+
+    Computing Levenshtein between strings '' and '' - OK
+    Computing Levenshtein between strings '' and 'u' - OK
+    Computing Levenshtein between strings '' and 'nh' - OK
+    Computing Levenshtein between strings '' and 'fmf' - OK
+    Computing Levenshtein between strings '' and 'cljm' - OK
+    Computing Levenshtein between strings 'v' and '' - OK
+    Computing Levenshtein between strings 'v' and 'a' - OK
+    Computing Levenshtein between strings 'v' and 'hp' - OK
+    Computing Levenshtein between strings 'g' and 'ktk' - OK
+    Computing Levenshtein between strings 'o' and 'ydqu' - OK
+    Computing Levenshtein between strings 'ke' and '' - OK
+    Computing Levenshtein between strings 'eu' and 'w' - OK
+    Computing Levenshtein between strings 'hi' and 'gz' - OK
+    Computing Levenshtein between strings 'mx' and 'tbw' - OK
+    Computing Levenshtein between strings 'uh' and 'lgad' - OK
+    Computing Levenshtein between strings 'xpj' and '' - OK
+    Computing Levenshtein between strings 'cdt' and 'f' - OK
+    Computing Levenshtein between strings 'trl' and 'rl' - OK
+    Computing Levenshtein between strings 'zai' and 'pqo' - OK
+    Computing Levenshtein between strings 'vac' and 'nrov' - OK
+    Computing Levenshtein between strings 'rnay' and '' - OK
+    Computing Levenshtein between strings 'xnfg' and 'o' - OK
+    Computing Levenshtein between strings 'jdgl' and 'ra' - OK
+    Computing Levenshtein between strings 'wpyq' and 'jxp' - OK
+    Computing Levenshtein between strings 'enpt' and 'hvfb' - OK
+
+Computations in FHE
+
+    Computing Levenshtein between strings '' and '' - OK in 0.01 seconds
+    Computing Levenshtein between strings '' and 'u' - OK in 0.01 seconds
+    Computing Levenshtein between strings '' and 'nh' - OK in 0.01 seconds
+    Computing Levenshtein between strings '' and 'fmf' - OK in 0.01 seconds
+    Computing Levenshtein between strings '' and 'cljm' - OK in 0.01 seconds
+    Computing Levenshtein between strings 'v' and '' - OK in 0.01 seconds
+    Computing Levenshtein between strings 'v' and 'a' - OK in 1.75 seconds
+    Computing Levenshtein between strings 'v' and 'hp' - OK in 1.77 seconds
+    Computing Levenshtein between strings 'g' and 'ktk' - OK in 2.78 seconds
+    Computing Levenshtein between strings 'o' and 'ydqu' - OK in 3.61 seconds
+    Computing Levenshtein between strings 'ke' and '' - OK in 0.01 seconds
+    Computing Levenshtein between strings 'eu' and 'w' - OK in 1.73 seconds
+    Computing Levenshtein between strings 'hi' and 'gz' - OK in 3.53 seconds
+    Computing Levenshtein between strings 'mx' and 'tbw' - OK in 5.25 seconds
+    Computing Levenshtein between strings 'uh' and 'lgad' - OK in 7.21 seconds
+    Computing Levenshtein between strings 'xpj' and '' - OK in 0.01 seconds
+    Computing Levenshtein between strings 'cdt' and 'f' - OK in 2.53 seconds
+    Computing Levenshtein between strings 'trl' and 'rl' - OK in 5.32 seconds
+    Computing Levenshtein between strings 'zai' and 'pqo' - OK in 7.93 seconds
+    Computing Levenshtein between strings 'vac' and 'nrov' - OK in 10.73 seconds
+    Computing Levenshtein between strings 'rnay' and '' - OK in 0.01 seconds
+    Computing Levenshtein between strings 'xnfg' and 'o' - OK in 3.50 seconds
+    Computing Levenshtein between strings 'jdgl' and 'ra' - OK in 7.01 seconds
+    Computing Levenshtein between strings 'wpyq' and 'jxp' - OK in 10.67 seconds
+    Computing Levenshtein between strings 'enpt' and 'hvfb' - OK in 14.30 seconds
+
+Successful end
+```
+
+- `levenshtein_distance.py --autoperf`: Benchmark with random strings, for the different alphabets.
+
+```
+
+Typical performances for alphabet ACTG, with string of maximal length:
+
+    Computing Levenshtein between strings 'GGAA' and 'AATT' - OK in 5.12 seconds
+    Computing Levenshtein between strings 'TGCG' and 'ACAG' - OK in 5.00 seconds
+    Computing Levenshtein between strings 'ATAC' and 'CTAA' - OK in 4.94 seconds
+
+Typical performances for alphabet string, with string of maximal length:
+
+    Computing Levenshtein between strings 'mtpp' and 'qujk' - OK in 15.48 seconds
+    Computing Levenshtein between strings 'sucl' and 'teeu' - OK in 14.22 seconds
+    Computing Levenshtein between strings 'prej' and 'latp' - OK in 14.07 seconds
+
+Typical performances for alphabet STRING, with string of maximal length:
+
+    Computing Levenshtein between strings 'ATRC' and 'VHCZ' - OK in 15.65 seconds
+    Computing Levenshtein between strings 'BOPL' and 'AUVT' - OK in 14.38 seconds
+    Computing Levenshtein between strings 'AMLK' and 'HEZX' - OK in 14.22 seconds
+
+Typical performances for alphabet StRiNg, with string of maximal length:
+
+    Computing Levenshtein between strings 'uIWB' and 'aYZR' - OK in 29.01 seconds
+    Computing Levenshtein between strings 'adWI' and 'OXyg' - OK in 27.17 seconds
+    Computing Levenshtein between strings 'jvhQ' and 'Weug' - OK in 26.55 seconds
+
+Successful end
+
+```
+
+## Benchmarks on hpc7a
+
+The benchmarks were run with Concrete 2.7 on an AWS `hpc7a` machine, and give:
+
+```
+Typical performances for alphabet ACTG, with string of maximal length:
+
+    Computing Levenshtein between strings 'GGAA' and 'AATT' - OK in 5.12 seconds
+    Computing Levenshtein between strings 'TGCG' and 'ACAG' - OK in 5.00 seconds
+    Computing Levenshtein between strings 'ATAC' and 'CTAA' - OK in 4.94 seconds
+
+Typical performances for alphabet string, with string of maximal length:
+
+    Computing Levenshtein between strings 'mtpp' and 'qujk' - OK in 15.48 seconds
+    Computing Levenshtein between strings 'sucl' and 'teeu' - OK in 14.22 seconds
+    Computing Levenshtein between strings 'prej' and 'latp' - OK in 14.07 seconds
+
+Typical performances for alphabet STRING, with string of maximal length:
+
+    Computing Levenshtein between strings 'ATRC' and 'VHCZ' - OK in 15.65 seconds
+    Computing Levenshtein between strings 'BOPL' and 'AUVT' - OK in 14.38 seconds
+    Computing Levenshtein between strings 'AMLK' and 'HEZX' - OK in 14.22 seconds
+
+Typical performances for alphabet StRiNg, with string of maximal length:
+
+    Computing Levenshtein between strings 'uIWB' and 'aYZR' - OK in 29.01 seconds
+    Computing Levenshtein between strings 'adWI' and 'OXyg' - OK in 27.17 seconds
+    Computing Levenshtein between strings 'jvhQ' and 'Weug' - OK in 26.55 seconds
+
+Successful end
+
+```
+
+FIXME: re-run the benchmarks on AWS
+
+
+
+
+
diff --git a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py
index 77c3d0cb1c..1a7cb0a22c 100644
--- a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py
+++ b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py
@@ -9,6 +9,31 @@
 
 from concrete import fhe
 
+
+def random_pick_in_values(mapping_to_int):
+    """Pick the integer-encoding of a random char in an alphabet."""
+    return numpy.random.randint(len(mapping_to_int))
+
+
+def random_pick_in_keys(mapping_to_int):
+    """Pick a random char in an alphabet."""
+    return random.choice(list(mapping_to_int))
+
+
+def random_string(mapping_to_int, l):
+    """Pick a random string in the alphabet."""
+    return "".join([random_pick_in_keys(mapping_to_int) for _ in range(l)])
+
+
+def check_string_is_in_alphabet(string, mapping_to_int):
+    """Check a string is a valid string of an alphabet."""
+    for c in string:
+        if c not in mapping_to_int:
+            raise ValueError(
+                f"Char {c} of {string} is not in alphabet {list(mapping_to_int.keys())}, please choose the right --alphabet"
+            )
+
+
 # Module FHE
 @fhe.module()
 class MyModule:
@@ -50,21 +75,9 @@ def mix(is_equal, if_equal, case_1, case_2, case_3):
     )
 
 
-def random_pick_in_values(mapping_to_int):
-    return numpy.random.randint(len(mapping_to_int))
-
-
-def random_pick_in_keys(mapping_to_int):
-    return random.choice(list(mapping_to_int))
-
-
-def random_string(mapping_to_int, l):
-    return "".join([random_pick_in_keys(mapping_to_int) for _ in range(l)])
-
-
-# Function in clear, for reference and comparison
 @lru_cache
 def levenshtein_clear(x, y):
+    """Compute the distance in clear, for reference and comparison."""
     if len(x) == 0:
         return len(y)
     if len(y) == 0:
@@ -80,9 +93,9 @@ def levenshtein_clear(x, y):
     return 1 + min(case_1, case_2, case_3)
 
 
-# Function in FHE-simulate, to debug
 @lru_cache
 def levenshtein_simulate(my_module, x, y):
+    """Compute the distance in simulation."""
     if len(x) == 0:
         return len(y)
     if len(y) == 0:
@@ -99,14 +112,12 @@ def levenshtein_simulate(my_module, x, y):
     return returned_value
 
 
-# Function in FHE
 @lru_cache
 def levenshtein_fhe(my_module, x, y):
+    """Compute the distance in FHE."""
     if len(x) == 0:
-        # In clear, that's return len(y)
         return my_module.mix.encrypt(None, len(y), None, None, None)[1]
     if len(y) == 0:
-        # In clear, that's return len(x)
         return my_module.mix.encrypt(None, len(x), None, None, None)[1]
 
     if_equal = levenshtein_fhe(my_module, x[1:], y[1:])
@@ -114,15 +125,14 @@ def levenshtein_fhe(my_module, x, y):
     case_2 = levenshtein_fhe(my_module, x, y[1:])
     case_3 = if_equal
 
-    # In FHE
     is_equal = my_module.equal.run(x[0], y[0])
     returned_value = my_module.mix.run(is_equal, if_equal, case_1, case_2, case_3)
 
     return returned_value
 
 
-# Manage user args
 def manage_args():
+    """Manage user arguments."""
     parser = argparse.ArgumentParser(description="Levenshtein distance in Concrete.")
     parser.add_argument(
         "--show_mlir",
@@ -148,6 +158,14 @@ def manage_args():
         action="store_true",
         help="Run benchmarks",
     )
+    parser.add_argument(
+        "--distance",
+        dest="distance",
+        nargs=2,
+        type=str,
+        action="store",
+        help="Compute a distance",
+    )
     parser.add_argument(
         "--alphabet",
         dest="alphabet",
@@ -163,11 +181,17 @@ def manage_args():
         help="Setting the maximal size of strings",
     )
     args = parser.parse_args()
+
+    # At least one mode must be selected. Use parser.error rather than assert: it prints
+    # the usage message, exits with status 2, and is not stripped under `python -O`.
+    if not (args.autoperf or args.autotest or args.distance is not None):
+        parser.error("must activate one option --autoperf or --autotest or --distance")
+
     return args
 
 
 def compile_module(mapping_to_int, args):
-    # Compilation
+    """Compile the FHE module."""
     inputset_equal = [
         (random_pick_in_values(mapping_to_int), random_pick_in_values(mapping_to_int))
         for _ in range(1000)
@@ -196,6 +220,7 @@ def compile_module(mapping_to_int, args):
 
 
 def prepare_alphabet_mapping(alphabet, verbose=True):
+    """Check the alphabet option and compute corresponding char-to-int mapping."""
     if alphabet == "string":
         letters = "".join([chr(97 + i) for i in range(26)])
     elif alphabet == "STRING":
@@ -220,7 +245,7 @@ def prepare_alphabet_mapping(alphabet, verbose=True):
 
 
 def prepare_random_patterns(mapping_to_int, len_min, len_max, nb_strings):
-    # Random patterns of different lengths
+    """Prepare random patterns of different lengths."""
     list_patterns = []
     for _ in range(nb_strings):
         for length_1 in range(len_min, len_max + 1):
@@ -237,8 +262,7 @@ def prepare_random_patterns(mapping_to_int, len_min, len_max, nb_strings):
 
 
 def compute_in_simulation(my_module, list_patterns, mapping_to_int):
-
-    # Checks in simulation
+    """Check equality between distance in simulation and clear distance."""
     print("Computations in simulation\n")
 
     for a, b in list_patterns:
@@ -255,8 +279,8 @@ def compute_in_simulation(my_module, list_patterns, mapping_to_int):
         print(" - OK")
 
 
-def compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False):
-    # Key generation
+def compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=True, show_distance=False):
+    """Check equality between distance in FHE and clear distance."""
     my_module.keygen()
 
     # Checks in FHE
@@ -282,10 +306,15 @@ def compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False):
         l1_clear = levenshtein_clear(a, b)
 
         assert l1_fhe == l1_clear, f"    {l1_fhe=} and {l1_clear=} are different"
-        print(f" - OK in {time_end - time_begin:.2f} seconds")
+
+        if not show_distance:
+            print(f" - OK in {time_end - time_begin:.2f} seconds")
+        else:
+            print(f" - distance is {l1_fhe}, computed in {time_end - time_begin:.2f} seconds")
 
 
 def main():
+    """Main function."""
     print()
 
     # Options by the user
@@ -311,6 +340,18 @@ def main():
             compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False)
             print("")
 
+    if args.distance is not None:
+        print(
+            f"Running distance between strings {args.distance[0]} and {args.distance[1]} for alphabet {args.alphabet}:\n"
+        )
+        mapping_to_int = prepare_alphabet_mapping(args.alphabet, verbose=False)
+        for string in args.distance:  # validate before compiling, so bad input fails fast
+            check_string_is_in_alphabet(string, mapping_to_int)
+        my_module = compile_module(mapping_to_int, args)
+        list_patterns = [args.distance]
+        compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False, show_distance=True)
+        print("")
+
     print("Successful end\n")