From fa390306f616c1d8a99323b6fee476d65404facf Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Wed, 31 Jan 2024 23:51:09 +0100
Subject: [PATCH 1/3] remove stopwords from normalised tags on node update

---
 backend/editor/entries.py                     | 29 +++++++++++++++++--
 .../parser/taxonomy_parser.py                 | 19 ++----------
 parser/openfoodfacts_taxonomy_parser/utils.py | 11 ++++++-
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 5a712d6d..8bf2aca1 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -60,13 +60,14 @@ async def create_node(self, label, entry, main_language_code):
         """
         params = {"id": entry}
         query = [f"""CREATE (n:{self.project_name}:{label})\n"""]
+        stopwords = await self.get_stopwords_dict()
 
         # Build all basic keys of a node
         if label == "ENTRY":
             # Normalizing new canonical tag
             language_code, canonical_tag = entry.split(":", 1)
             normalised_canonical_tag = parser_utils.normalize_text(
-                canonical_tag, main_language_code
+                canonical_tag, main_language_code, stopwords=stopwords
             )
 
             # Reconstructing and updation of node ID
@@ -438,6 +439,27 @@ async def get_children(self, entry):
         result = await get_current_transaction().run(query, {"id": entry})
         return await async_list(result)
 
+    async def get_stopwords_dict(self):
+        """
+        Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
+        where the keys are the language codes and the values are the stopwords in the corresponding language
+        """
+        query = f"""
+        MATCH (s:{self.project_name}:STOPWORDS)
+        WITH keys(s) AS properties, s
+        UNWIND properties AS property
+        WITH s, property
+        WHERE property STARTS WITH 'tags_ids'
+            RETURN property AS tags_ids_lc, s[property] AS stopwords
+        """
+        result = await get_current_transaction().run(query)
+        records = await async_list(result)
+        stopwords_dict = {}
+        for record in records:
+            language_code = record["tags_ids_lc"].split("_")[-1]
+            stopwords_dict[language_code] = record["stopwords"]
+        return stopwords_dict
+
     async def update_node(self, label, entry, new_node):
         """
         Helper function used for updation of node with given id and label
@@ -471,6 +493,7 @@ async def update_node(self, label, entry, new_node):
 
         # Adding normalized tags ids corresponding to entry tags
         normalised_new_node = {}
+        stopwords = await self.get_stopwords_dict()
         for key in set(new_node.keys()) - deleted_keys:
             if key.startswith("tags_"):
                 if "_ids_" not in key:
@@ -478,7 +501,9 @@ async def update_node(self, label, entry, new_node):
                     normalised_value = []
                     for value in new_node[key]:
                         normalised_value.append(
-                            parser_utils.normalize_text(value, keys_language_code)
+                            parser_utils.normalize_text(
+                                value, keys_language_code, stopwords=stopwords
+                            )
                         )
                     normalised_new_node[key] = new_node[key]
                     normalised_new_node["tags_ids_" + keys_language_code] = normalised_value

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index ff91f723..bb817259 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -98,19 +98,6 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
             line_count += 1
         yield line_count, ""  # to end the last entry if not ended
 
-    def _remove_stopwords(self, lc: str, words: str) -> str:
-        """Remove the stopwords that were read at the beginning of the file"""
-        # First check if this language has stopwords
-        if lc in self.stopwords:
-            words_to_remove = self.stopwords[lc]
-            new_words = []
-            for word in words.split("-"):
-                if word not in words_to_remove:
-                    new_words.append(word)
-            return ("-").join(new_words)
-        else:
-            return words
-
     def _add_line(self, line: str) -> str:
         """
         Get a normalized string but keeping the language code "lc:",
@@ -118,7 +105,7 @@ def _add_line(self, line: str) -> str:
         """
         lc, line = line.split(":", 1)
         new_line = lc + ":"
-        new_line += self._remove_stopwords(lc, normalize_text(line, lc))
+        new_line += normalize_text(line, lc, stopwords=self.stopwords)
         return new_line
 
     def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
@@ -126,7 +113,7 @@ def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
         lc, line = line.split(":", 1)
         new_line: list[str] = []
         for word in line.split(","):
-            new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
+            new_line.append(normalize_text(word, lc, stopwords=self.stopwords))
         return lc, new_line
 
     def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
@@ -291,7 +278,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                     tagsids_list = []
                     for word in line.split(","):
                         tags_list.append(word.strip())
-                        word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
+                        word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
                         if word_normalized not in tagsids_list:
                             # in case 2 normalized synonyms are the same
                             tagsids_list.append(word_normalized)

diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py
index 8b942186..0e343ef4 100644
--- a/parser/openfoodfacts_taxonomy_parser/utils.py
+++ b/parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,7 +4,7 @@
 import unidecode
 
 
-def normalize_text(line: str, lang="default", char="-"):
+def normalize_text(line: str, lang="default", char="-", stopwords={}):
     """Normalize a string depending on the language code"""
     line = unicodedata.normalize("NFC", line)
 
@@ -29,6 +29,15 @@ def normalize_text(line: str, lang="default", char="-"):
     # Removing excess "-"
     line = re.sub(r"-+", char, line)
     line = line.strip(char)
+
+    # Remove stopwords
+    if lang in stopwords:
+        stopwords = stopwords[lang]
+        line_surrounded_by_char = char + line + char
+        for stopword in stopwords:
+            line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
+        line = line_surrounded_by_char[1:-1]
+
     return line
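Note on patch 1: the parser's _remove_stopwords helper is folded into normalize_text itself, so the backend editor and the parser now share one code path. A minimal standalone sketch of the added stopword-removal step, runnable on its own (the stopwords dict and sample words here are hypothetical; in the real function, lowercasing, accent stripping, and separator joining have already produced `line` before this step runs):

    # Sketch of the stopword-removal step that patch 1 adds to normalize_text.
    def remove_stopwords(line: str, lang: str, stopwords: dict[str, list[str]], char: str = "-") -> str:
        if lang in stopwords:
            # Surrounding the line with the separator lets every word, including
            # the first and the last, be matched as "-word-", so a stopword is
            # never removed as a substring of a longer word.
            line_surrounded_by_char = char + line + char
            for stopword in stopwords[lang]:
                line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
            line = line_surrounded_by_char[1:-1]
        return line

    # "au" is dropped as a standalone word but left intact inside "saumon":
    assert remove_stopwords("pates-au-saumon", "fr", {"fr": ["au", "aux", "de"]}) == "pates-saumon"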
From e7f14b100a84b51efb305a09f9d0cd3855a223ef Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Thu, 1 Feb 2024 00:02:21 +0100
Subject: [PATCH 2/3] format

---
 backend/editor/entries.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 8bf2aca1..1e448112 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -442,14 +442,15 @@ async def get_children(self, entry):
     async def get_stopwords_dict(self):
         """
         Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
-        where the keys are the language codes and the values are the stopwords in the corresponding language
+        where the keys are the language codes, and the values are the stopwords in the
+        corresponding language
         """
         query = f"""
-        MATCH (s:{self.project_name}:STOPWORDS)
-        WITH keys(s) AS properties, s
-        UNWIND properties AS property
-        WITH s, property
-        WHERE property STARTS WITH 'tags_ids'
+            MATCH (s:{self.project_name}:STOPWORDS)
+            WITH keys(s) AS properties, s
+            UNWIND properties AS property
+            WITH s, property
+            WHERE property STARTS WITH 'tags_ids'
             RETURN property AS tags_ids_lc, s[property] AS stopwords
         """
         result = await get_current_transaction().run(query)
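Note on the query that patch 2 reformats: each taxonomy stores its stopwords as tags_ids_<language code> properties on a STOPWORDS node, and the Cypher query unwinds those property names so the Python side can key the result by language code. A small sketch of the reshaping done after the query, using hypothetical rows in the shape the query returns:

    # Hypothetical rows, one per "tags_ids_<lc>" property on a STOPWORDS node.
    records = [
        {"tags_ids_lc": "tags_ids_en", "stopwords": ["a", "an", "the"]},
        {"tags_ids_lc": "tags_ids_fr", "stopwords": ["au", "aux", "de"]},
    ]

    stopwords_dict = {}
    for record in records:
        # "tags_ids_en" -> "en": the language code is the last "_"-separated piece.
        language_code = record["tags_ids_lc"].split("_")[-1]
        stopwords_dict[language_code] = record["stopwords"]

    assert stopwords_dict == {"en": ["a", "an", "the"], "fr": ["au", "aux", "de"]}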
From 5f4ce9558fc0f0d149fed5aca52755b8966ab6a9 Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Thu, 1 Feb 2024 17:59:49 +0100
Subject: [PATCH 3/3] refactor to resolve comments

---
 backend/editor/entries.py                     | 9 ++++-----
 parser/openfoodfacts_taxonomy_parser/utils.py | 5 ++++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 1e448112..9aba4751 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -439,7 +439,7 @@ async def get_children(self, entry):
         result = await get_current_transaction().run(query, {"id": entry})
         return await async_list(result)
 
-    async def get_stopwords_dict(self):
+    async def get_stopwords_dict(self) -> dict[str, list[str]]:
         """
         Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
         where the keys are the language codes, and the values are the stopwords in the
@@ -455,10 +455,9 @@ async def get_stopwords_dict(self):
         """
         result = await get_current_transaction().run(query)
         records = await async_list(result)
-        stopwords_dict = {}
-        for record in records:
-            language_code = record["tags_ids_lc"].split("_")[-1]
-            stopwords_dict[language_code] = record["stopwords"]
+        stopwords_dict = {
+            record["tags_ids_lc"].split("_")[-1]: record["stopwords"] for record in records
+        }
         return stopwords_dict
 
     async def update_node(self, label, entry, new_node):

diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py
index 0e343ef4..54024922 100644
--- a/parser/openfoodfacts_taxonomy_parser/utils.py
+++ b/parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,8 +4,11 @@
 import unidecode
 
 
-def normalize_text(line: str, lang="default", char="-", stopwords={}):
+def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: dict[str, list[str]] | None = None) -> str:
     """Normalize a string depending on the language code"""
+    if stopwords is None:
+        stopwords = {}
+
     line = unicodedata.normalize("NFC", line)
 
     # Removing accent
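Note on patch 3: the review-driven refactor swaps the stopwords={} default for a None sentinel checked inside the function, and adds return-type and parameter annotations. normalize_text only reads the dict, so the mutable default was not an active bug here, but it is the standard Python trap the sentinel avoids: a default value is evaluated once, at definition time, and shared by every call. A hypothetical illustration, not taken from the codebase:

    # A mutable default is created once and leaks state across calls.
    def risky(value, seen=[]):
        seen.append(value)
        return seen

    assert risky("a") == ["a"]
    assert risky("b") == ["a", "b"]  # the list from the first call is reused!

    # The None sentinel used in patch 3 gives each call a fresh object instead.
    def safe(value, seen=None):
        if seen is None:
            seen = []
        seen.append(value)
        return seen

    assert safe("a") == ["a"]
    assert safe("b") == ["b"]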