From 2771fedcb65e330c16bcc422cf19492eecc6cc3e Mon Sep 17 00:00:00 2001 From: Benoit Chevallier-Mames Date: Mon, 1 Jul 2024 12:00:19 +0200 Subject: [PATCH] docs(frontend): adding a --distance option --- .../levenshtein_distance.md | 203 ++++++++++++++++++ .../levenshtein_distance.py | 93 +++++--- 2 files changed, 270 insertions(+), 26 deletions(-) create mode 100644 frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md diff --git a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md new file mode 100644 index 0000000000..9085cc5967 --- /dev/null +++ b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.md @@ -0,0 +1,203 @@ +# Computing the Levenshtein distance in FHE + +## Levenshtein distance + +Levenshtein distance is a classical distance to compare two strings. Let's write strings a and b as +vectors of characters, meaning a[0] is the first char of a and a[1:] is the rest of the string. +Levenshtein distance is defined as: + + Levenshtein(a, b) := + length(a) if length(b) == 0, or + length(b) if length(a) == 0, or + Levenshtein(a[1:], b[1:]) if a[0] == b[0], or + 1 + min(Levenshtein(a[1:], b), Levenshtein(a, b[1:]), Levenshtein(a[1:], b[1:])) + +More information can be found for example on the [Wikipedia page](https://en.wikipedia.org/wiki/Levenshtein_distance). + +## Computing the distance in FHE + +It can be interesting to compute this distance over encrypted data, for example in the banking sector. +We show in [our code](levenshtein_distance.py) how to do that simply, with our FHE modules. + +Available options are: + +``` + +usage: levenshtein_distance.py [-h] [--show_mlir] [--show_optimizer] [--autotest] [--autoperf] [--distance DISTANCE DISTANCE] + [--alphabet {string,STRING,StRiNg,ACTG}] [--max_string_length MAX_STRING_LENGTH] + +Levenshtein distance in Concrete. 
+ +optional arguments: + -h, --help show this help message and exit + --show_mlir Show the MLIR + --show_optimizer Show the optimizer outputs + --autotest Run random tests + --autoperf Run benchmarks + --distance DISTANCE DISTANCE + Compute a distance + --alphabet {string,STRING,StRiNg,ACTG} + Setting the alphabet + --max_string_length MAX_STRING_LENGTH + Setting the maximal size of strings +``` + +The different alphabets are: +- string: lowercase letters, i.e. `[a-z]*` +- STRING: uppercase letters, i.e. `[A-Z]*` +- StRiNg: lowercase and uppercase letters +- ACTG: `[ACTG]*`, for DNA analysis + +It is very easy to add a new alphabet in the code. + +The most important usages are: + +- `levenshtein_distance.py --distance Zama amazing --alphabet StRiNg`: Compute the distance between +strings "Zama" and "amazing", using the characters of the "StRiNg" alphabet + +``` +Running distance between strings aa and ab for alphabet string: + + Computing Levenshtein between strings 'aa' and 'ab' - distance is 1, computed in 4.13 seconds + +Successful end +``` + +FIXME: the sample output above is for strings "aa" and "ab" with the "string" alphabet, not the command shown; re-run when the semantic bug is found + +- `levenshtein_distance.py --autotest`: Run random tests with the alphabet. 
+ +``` +Making random tests with alphabet string +Letters are abcdefghijklmnopqrstuvwxyz + +Computations in simulation + + Computing Levenshtein between strings '' and '' - OK + Computing Levenshtein between strings '' and 'u' - OK + Computing Levenshtein between strings '' and 'nh' - OK + Computing Levenshtein between strings '' and 'fmf' - OK + Computing Levenshtein between strings '' and 'cljm' - OK + Computing Levenshtein between strings 'v' and '' - OK + Computing Levenshtein between strings 'v' and 'a' - OK + Computing Levenshtein between strings 'v' and 'hp' - OK + Computing Levenshtein between strings 'g' and 'ktk' - OK + Computing Levenshtein between strings 'o' and 'ydqu' - OK + Computing Levenshtein between strings 'ke' and '' - OK + Computing Levenshtein between strings 'eu' and 'w' - OK + Computing Levenshtein between strings 'hi' and 'gz' - OK + Computing Levenshtein between strings 'mx' and 'tbw' - OK + Computing Levenshtein between strings 'uh' and 'lgad' - OK + Computing Levenshtein between strings 'xpj' and '' - OK + Computing Levenshtein between strings 'cdt' and 'f' - OK + Computing Levenshtein between strings 'trl' and 'rl' - OK + Computing Levenshtein between strings 'zai' and 'pqo' - OK + Computing Levenshtein between strings 'vac' and 'nrov' - OK + Computing Levenshtein between strings 'rnay' and '' - OK + Computing Levenshtein between strings 'xnfg' and 'o' - OK + Computing Levenshtein between strings 'jdgl' and 'ra' - OK + Computing Levenshtein between strings 'wpyq' and 'jxp' - OK + Computing Levenshtein between strings 'enpt' and 'hvfb' - OK + +Computations in FHE + + Computing Levenshtein between strings '' and '' - OK in 0.01 seconds + Computing Levenshtein between strings '' and 'u' - OK in 0.01 seconds + Computing Levenshtein between strings '' and 'nh' - OK in 0.01 seconds + Computing Levenshtein between strings '' and 'fmf' - OK in 0.01 seconds + Computing Levenshtein between strings '' and 'cljm' - OK in 0.01 seconds + Computing 
Levenshtein between strings 'v' and '' - OK in 0.01 seconds + Computing Levenshtein between strings 'v' and 'a' - OK in 1.75 seconds + Computing Levenshtein between strings 'v' and 'hp' - OK in 1.77 seconds + Computing Levenshtein between strings 'g' and 'ktk' - OK in 2.78 seconds + Computing Levenshtein between strings 'o' and 'ydqu' - OK in 3.61 seconds + Computing Levenshtein between strings 'ke' and '' - OK in 0.01 seconds + Computing Levenshtein between strings 'eu' and 'w' - OK in 1.73 seconds + Computing Levenshtein between strings 'hi' and 'gz' - OK in 3.53 seconds + Computing Levenshtein between strings 'mx' and 'tbw' - OK in 5.25 seconds + Computing Levenshtein between strings 'uh' and 'lgad' - OK in 7.21 seconds + Computing Levenshtein between strings 'xpj' and '' - OK in 0.01 seconds + Computing Levenshtein between strings 'cdt' and 'f' - OK in 2.53 seconds + Computing Levenshtein between strings 'trl' and 'rl' - OK in 5.32 seconds + Computing Levenshtein between strings 'zai' and 'pqo' - OK in 7.93 seconds + Computing Levenshtein between strings 'vac' and 'nrov' - OK in 10.73 seconds + Computing Levenshtein between strings 'rnay' and '' - OK in 0.01 seconds + Computing Levenshtein between strings 'xnfg' and 'o' - OK in 3.50 seconds + Computing Levenshtein between strings 'jdgl' and 'ra' - OK in 7.01 seconds + Computing Levenshtein between strings 'wpyq' and 'jxp' - OK in 10.67 seconds + Computing Levenshtein between strings 'enpt' and 'hvfb' - OK in 14.30 seconds + +Successful end +``` + +- `levenshtein_distance.py --autoperf`: Benchmark with random strings, for the different alphabets. 
+ +``` + +Typical performances for alphabet ACTG, with string of maximal length: + + Computing Levenshtein between strings 'GGAA' and 'AATT' - OK in 5.12 seconds + Computing Levenshtein between strings 'TGCG' and 'ACAG' - OK in 5.00 seconds + Computing Levenshtein between strings 'ATAC' and 'CTAA' - OK in 4.94 seconds + +Typical performances for alphabet string, with string of maximal length: + + Computing Levenshtein between strings 'mtpp' and 'qujk' - OK in 15.48 seconds + Computing Levenshtein between strings 'sucl' and 'teeu' - OK in 14.22 seconds + Computing Levenshtein between strings 'prej' and 'latp' - OK in 14.07 seconds + +Typical performances for alphabet STRING, with string of maximal length: + + Computing Levenshtein between strings 'ATRC' and 'VHCZ' - OK in 15.65 seconds + Computing Levenshtein between strings 'BOPL' and 'AUVT' - OK in 14.38 seconds + Computing Levenshtein between strings 'AMLK' and 'HEZX' - OK in 14.22 seconds + +Typical performances for alphabet StRiNg, with string of maximal length: + + Computing Levenshtein between strings 'uIWB' and 'aYZR' - OK in 29.01 seconds + Computing Levenshtein between strings 'adWI' and 'OXyg' - OK in 27.17 seconds + Computing Levenshtein between strings 'jvhQ' and 'Weug' - OK in 26.55 seconds + +Successful end + +``` + +## Benchmarks on hpc7a + +The benchmarks were done using Concrete 2.7 on an `hpc7a` machine on AWS, and give: + +``` +Typical performances for alphabet ACTG, with string of maximal length: + + Computing Levenshtein between strings 'GGAA' and 'AATT' - OK in 5.12 seconds + Computing Levenshtein between strings 'TGCG' and 'ACAG' - OK in 5.00 seconds + Computing Levenshtein between strings 'ATAC' and 'CTAA' - OK in 4.94 seconds + +Typical performances for alphabet string, with string of maximal length: + + Computing Levenshtein between strings 'mtpp' and 'qujk' - OK in 15.48 seconds + Computing Levenshtein between strings 'sucl' and 'teeu' - OK in 14.22 seconds + Computing Levenshtein between 
strings 'prej' and 'latp' - OK in 14.07 seconds + +Typical performances for alphabet STRING, with string of maximal length: + + Computing Levenshtein between strings 'ATRC' and 'VHCZ' - OK in 15.65 seconds + Computing Levenshtein between strings 'BOPL' and 'AUVT' - OK in 14.38 seconds + Computing Levenshtein between strings 'AMLK' and 'HEZX' - OK in 14.22 seconds + +Typical performances for alphabet StRiNg, with string of maximal length: + + Computing Levenshtein between strings 'uIWB' and 'aYZR' - OK in 29.01 seconds + Computing Levenshtein between strings 'adWI' and 'OXyg' - OK in 27.17 seconds + Computing Levenshtein between strings 'jvhQ' and 'Weug' - OK in 26.55 seconds + +Successful end + +``` + +FIXME: re-run the benchmarks on AWS + + + + + diff --git a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py index 77c3d0cb1c..1a7cb0a22c 100644 --- a/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py +++ b/frontends/concrete-python/examples/levenshtein_distance/levenshtein_distance.py @@ -9,6 +9,31 @@ from concrete import fhe + +def random_pick_in_values(mapping_to_int): + """Pick the integer-encoding of a random char in an alphabet.""" + return numpy.random.randint(len(mapping_to_int)) + + +def random_pick_in_keys(mapping_to_int): + """Pick a random char in an alphabet.""" + return random.choice(list(mapping_to_int)) + + +def random_string(mapping_to_int, l): + """Pick a random string in the alphabet.""" + return "".join([random_pick_in_keys(mapping_to_int) for _ in range(l)]) + + +def check_string_is_in_alphabet(string, mapping_to_int): + """Check a string is a valid string of an alphabet.""" + for c in string: + if c not in mapping_to_int: + raise ValueError( + f"Char {c} of {string} is not in alphabet {list(mapping_to_int.keys())}, please choose the right --alphabet" + ) + + # Module FHE @fhe.module() class MyModule: @@ 
-50,21 +75,9 @@ def mix(is_equal, if_equal, case_1, case_2, case_3): ) -def random_pick_in_values(mapping_to_int): - return numpy.random.randint(len(mapping_to_int)) - - -def random_pick_in_keys(mapping_to_int): - return random.choice(list(mapping_to_int)) - - -def random_string(mapping_to_int, l): - return "".join([random_pick_in_keys(mapping_to_int) for _ in range(l)]) - - -# Function in clear, for reference and comparison @lru_cache def levenshtein_clear(x, y): + """Compute the distance in clear, for reference and comparison.""" if len(x) == 0: return len(y) if len(y) == 0: @@ -80,9 +93,9 @@ def levenshtein_clear(x, y): return 1 + min(case_1, case_2, case_3) -# Function in FHE-simulate, to debug @lru_cache def levenshtein_simulate(my_module, x, y): + """Compute the distance in simulation.""" if len(x) == 0: return len(y) if len(y) == 0: @@ -99,14 +112,12 @@ def levenshtein_simulate(my_module, x, y): return returned_value -# Function in FHE @lru_cache def levenshtein_fhe(my_module, x, y): + """Compute the distance in FHE.""" if len(x) == 0: - # In clear, that's return len(y) return my_module.mix.encrypt(None, len(y), None, None, None)[1] if len(y) == 0: - # In clear, that's return len(x) return my_module.mix.encrypt(None, len(x), None, None, None)[1] if_equal = levenshtein_fhe(my_module, x[1:], y[1:]) @@ -114,15 +125,14 @@ def levenshtein_fhe(my_module, x, y): case_2 = levenshtein_fhe(my_module, x, y[1:]) case_3 = if_equal - # In FHE is_equal = my_module.equal.run(x[0], y[0]) returned_value = my_module.mix.run(is_equal, if_equal, case_1, case_2, case_3) return returned_value -# Manage user args def manage_args(): + """Manage user arguments.""" parser = argparse.ArgumentParser(description="Levenshtein distance in Concrete.") parser.add_argument( "--show_mlir", @@ -148,6 +158,14 @@ def manage_args(): action="store_true", help="Run benchmarks", ) + parser.add_argument( + "--distance", + dest="distance", + nargs=2, + type=str, + action="store", + help="Compute a 
distance", + ) parser.add_argument( "--alphabet", dest="alphabet", @@ -163,11 +181,17 @@ def manage_args(): help="Setting the maximal size of strings", ) args = parser.parse_args() + + # At least one option + assert ( + args.autoperf + args.autotest + (args.distance != None) > 0 + ), "must activate one option --autoperf or --autotest or --distance" + return args def compile_module(mapping_to_int, args): - # Compilation + """Compile the FHE module.""" inputset_equal = [ (random_pick_in_values(mapping_to_int), random_pick_in_values(mapping_to_int)) for _ in range(1000) @@ -196,6 +220,7 @@ def compile_module(mapping_to_int, args): def prepare_alphabet_mapping(alphabet, verbose=True): + """Check the alphabet option and compute corresponding char-to-int mapping.""" if alphabet == "string": letters = "".join([chr(97 + i) for i in range(26)]) elif alphabet == "STRING": @@ -220,7 +245,7 @@ def prepare_alphabet_mapping(alphabet, verbose=True): def prepare_random_patterns(mapping_to_int, len_min, len_max, nb_strings): - # Random patterns of different lengths + """Prepare random patterns of different lengths.""" list_patterns = [] for _ in range(nb_strings): for length_1 in range(len_min, len_max + 1): @@ -237,8 +262,7 @@ def prepare_random_patterns(mapping_to_int, len_min, len_max, nb_strings): def compute_in_simulation(my_module, list_patterns, mapping_to_int): - - # Checks in simulation + """Check equality between distance in simulation and clear distance.""" print("Computations in simulation\n") for a, b in list_patterns: @@ -255,8 +279,8 @@ def compute_in_simulation(my_module, list_patterns, mapping_to_int): print(" - OK") -def compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False): - # Key generation +def compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=True, show_distance=False): + """Check equality between distance in FHE and clear distance.""" my_module.keygen() # Checks in FHE @@ -282,10 +306,15 @@ def compute_in_fhe(my_module, 
list_patterns, mapping_to_int, verbose=False): l1_clear = levenshtein_clear(a, b) assert l1_fhe == l1_clear, f" {l1_fhe=} and {l1_clear=} are different" - print(f" - OK in {time_end - time_begin:.2f} seconds") + + if not show_distance: + print(f" - OK in {time_end - time_begin:.2f} seconds") + else: + print(f" - distance is {l1_fhe}, computed in {time_end - time_begin:.2f} seconds") def main(): + """Main function.""" print() # Options by the user @@ -311,6 +340,18 @@ def main(): compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False) print("") + if args.distance != None: + print( + f"Running distance between strings {args.distance[0]} and {args.distance[1]} for alphabet {args.alphabet}:\n" + ) + mapping_to_int = prepare_alphabet_mapping(args.alphabet, verbose=False) + my_module = compile_module(mapping_to_int, args) + check_string_is_in_alphabet(args.distance[0], mapping_to_int) + check_string_is_in_alphabet(args.distance[1], mapping_to_int) + list_patterns = [args.distance] + compute_in_fhe(my_module, list_patterns, mapping_to_int, verbose=False, show_distance=True) + print("") + print("Successful end\n")