openfoodfacts · perierc · Jan 24, 2024 · Jan 19, 2024 · Jan 19, 2024 · Jan 24, 2024
@@ -243,15 +243,18 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     id = "stopwords:" + str(index_stopwords)
                     data = self._set_data_id(data, id, line_number)
                     index_stopwords += 1
+                    line = line[10:]
+                    tags = [words.strip() for words in line[3:].split(",")]
                     try:
-                        lc, value = self._get_lc_value(line[10:])
+                        lc, value = self._get_lc_value(line)
                     except ValueError:
                         self.parser_logger.error(
                             f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
                         )
                     else:
-                        data.tags["tags_" + lc] = value
-                        # add the list with its lc
+                        data.tags["tags_" + lc] = tags
+                        data.tags["tags_ids_" + lc] = value
+                        # add the normalized list with its lc
                         self.stopwords[lc] = value
                 elif line.startswith("synonyms"):
                     # general synonyms definition for a language

@@ -39,14 +39,9 @@ def get_all_nodes(self, project_label):
     def list_tags_lc(self, node):
         """return an ordered list of the language codes (lc) used in a node"""
         lc_list = []
-        if "stopwords" in node["id"]:
-            # stopwords node only have a tags_lc property
-            key = "tags_"
-            # number of dashes to split on to get language code
-            dash_before_lc = 1
-        else:
-            key = "tags_ids_"
-            dash_before_lc = 2
+        key = "tags_ids_"
+        # number of dashes to split on to get language code
+        dash_before_lc = 2
 
         for property in node:
             if property.startswith(key):

@@ -1,6 +1,6 @@
 # test taxonomy
 
-stopwords:fr: aux,au,de,le,du,la,a,et
+stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation
 
 synonyms:en:passion fruit, passionfruit
 

@@ -56,7 +56,7 @@ def test_round_trip(neo4j):
     for line in original_lines:
         # first tweak: spaces between stopwords
         if line.startswith("stopwords:fr: aux"):
-            line = "stopwords:fr:aux, au, de, le, du, la, a, et"
+            line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
         elif line.startswith("<fr:yaourts fruit de la passion"):
             line = "<en:Passion fruit yogurts"
@@ -98,7 +98,7 @@ def test_two_branch_round_trip(neo4j):
     for line in original_lines:
         # first tweak: spaces between stopwords
         if line.startswith("stopwords:fr: aux"):
-            line = "stopwords:fr:aux, au, de, le, du, la, a, et"
+            line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
         elif line.startswith("<fr:yaourts fruit de la passion"):
             line = "<en:Passion fruit yogurts"

@@ -69,7 +69,8 @@ def test_calling(neo4j):
         results = session.run(query)
         expected_stopwords = {
             "id": "stopwords:0",
-            "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et"],
+            "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test normalisation"],
+            "tags_ids_fr": ["aux", "au", "de", "le", "du", "la", "a", "et", "test-normalisation"],
             "preceding_lines": [],
         }
         for result in results: