From c58e37af1055058ba3a17395f4e4103304de2945 Mon Sep 17 00:00:00 2001
From: Ghislain Vaillant <ghislain.vaillant@inria.fr>
Date: Tue, 11 Jun 2024 16:04:13 +0200
Subject: [PATCH] BUG: Reduce memory overhead in simstring db

Closes #52
---
 medkit/text/ner/_base_simstring_matcher.py | 46 +++++++++-------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
index aeff4e49..0e582535 100644
--- a/medkit/text/ner/_base_simstring_matcher.py
+++ b/medkit/text/ner/_base_simstring_matcher.py
@@ -7,6 +7,7 @@
     "build_simstring_matcher_databases",
 ]
 
+import collections
 import dataclasses
 import math
 import re
@@ -385,35 +386,24 @@ def build_simstring_matcher_databases(
     rules : iterable of BaseSimstringMatcherRule
         Rules to add to databases
     """
-    # the params passed to simstring.writer are copy/pasted from QuickUMLS
-    # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
-    simstring_db_writer = simstring.writer(
-        str(simstring_db_file),
-        3,  # unit of character n-grams
-        False,  # represent begin and end of strings in n-grams
-        True,  # use unicode mode
-    )
-
-    # writeback=True needed because we are updating the values in the mapping,
-    # not just writing
-    rules_db = shelve.open(str(rules_db_file), flag="n", writeback=True)  # noqa: S301
-
-    # add rules to databases
+    # Group rules by normalized term (lowercased, then transliterated with
+    # anyascii) before persisting them: term -> list of rules
+    rules_mapping = collections.defaultdict(list)
     for rule in rules:
-        term_to_match = rule.term
-
-        # apply preprocessing
-        term_to_match = anyascii(term_to_match.lower())
-
-        # add to simstring db
-        simstring_db_writer.insert(term_to_match)
-        # add to rules db
-        if term_to_match not in rules_db:
-            rules_db[term_to_match] = []
-        rules_db[term_to_match].append(rule)
-    simstring_db_writer.close()
-    rules_db.sync()
-    rules_db.close()
+        term = anyascii(rule.term.lower())
+        rules_mapping[term].append(rule)
+
+    # Persist rules mapping in new shelf.
+    with shelve.open(str(rules_db_file), flag="n") as rules_db:  # noqa: S301
+        rules_db.update(rules_mapping)
+
+    # Update simstring db with terms in rules mapping.
+    # The simstring.writer parameters are taken from QuickUMLS,
+    # see https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L169.
+    simstring_db = simstring.writer(str(simstring_db_file), n=3, be=False, unicode=True)
+    for term in rules_mapping:
+        simstring_db.insert(term)
+    simstring_db.close()
 
 
 _TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")