From 17fcd1aa1d16a706151b724fcb94c16b6b22bf79 Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Fri, 19 Jan 2024 09:32:44 +0100
Subject: [PATCH 1/2] store raw stopwords and rewrite them at export

---
 .../parser/taxonomy_parser.py                        |  9 ++++++---
 parser/openfoodfacts_taxonomy_parser/unparser.py     | 11 +++--------
 parser/tests/data/test.txt                           |  2 +-
 .../integration/test_parse_unparse_integration.py    |  4 ++--
 parser/tests/integration/test_parser_integration.py  |  3 ++-
 5 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index 881fdd6f..2d112a94 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -247,15 +247,18 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     id = "stopwords:" + str(index_stopwords)
                     data = self._set_data_id(data, id, line_number)
                     index_stopwords += 1
+                    line = line[10:]
+                    tags = [words.strip() for words in line[3:].split(",")]
                     try:
-                        lc, value = self._get_lc_value(line[10:])
+                        lc, value = self._get_lc_value(line)
                     except ValueError:
                         self.parser_logger.error(
                             f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'"
                         )
                     else:
-                        data.tags["tags_" + lc] = value
-                        # add the list with its lc
+                        data.tags["tags_" + lc] = tags
+                        data.tags["tags_ids_" + lc] = value
+                        # add the normalized list with its lc
                         self.stopwords[lc] = value
                 elif line.startswith("synonyms"):
                     # general synonyms definition for a language
diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py
index d18af3f7..77ff7ef9 100644
--- a/parser/openfoodfacts_taxonomy_parser/unparser.py
+++ b/parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -52,14 +52,9 @@ def get_all_nodes(self, multi_label):
     def list_tags_lc(self, node):
         """return an ordered list of the language codes (lc) used in a node"""
         lc_list = []
-        if "stopwords" in node["id"]:
-            # stopwords node only have a tags_lc property
-            key = "tags_"
-            # number of dashes to split on to get language code
-            dash_before_lc = 1
-        else:
-            key = "tags_ids_"
-            dash_before_lc = 2
+        key = "tags_ids_"
+        # number of dashes to split on to get language code
+        dash_before_lc = 2

         for property in node:
             if property.startswith(key):
diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt
index 5ad4eba3..5837bf08 100644
--- a/parser/tests/data/test.txt
+++ b/parser/tests/data/test.txt
@@ -1,6 +1,6 @@
 # test taxonomy

-stopwords:fr: aux,au,de,le,du,la,a,et
+stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation

 synonyms:en:passion fruit, passionfruit

diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py
index 55f03e63..d1541b3f 100644
--- a/parser/tests/integration/test_parse_unparse_integration.py
+++ b/parser/tests/integration/test_parse_unparse_integration.py
@@ -56,7 +56,7 @@ def test_round_trip(neo4j):
     for line in original_lines:
         # first tweak: spaces between stopwords
         if line.startswith("stopwords:fr: aux"):
-            line = "stopwords:fr:aux, au, de, le, du, la, a, et"
+            line = "stopwords:fr:aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
         elif line.startswith("

Date: Wed, 24 Jan 2024 11:13:47 +0100
Subject: [PATCH 2/2] Apply suggestions from code review

Co-authored-by: Alex Garel
---
 parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index a6c065ff..ff91f723 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -243,7 +243,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
                     id = "stopwords:" + str(index_stopwords)
                     data = self._set_data_id(data, id, line_number)
                     index_stopwords += 1
+                    # remove "stopwords:" part
                     line = line[10:]
+                    # compute raw values outside _get_lc_value as it removes stop words !
                     tags = [words.strip() for words in line[3:].split(",")]
                     try:
                         lc, value = self._get_lc_value(line)
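
Illustrative note (not part of the patches): the change keeps two parallel values for a stopwords line, the raw comma-separated tags as written in the taxonomy file (so the unparser can re-export the line unchanged) and the normalized ids returned by _get_lc_value (still used to strip stop words during parsing). Below is a minimal standalone sketch of that split; normalize_text is only a rough stand-in for the project's real normalizer, not its API.

    # Standalone sketch, assuming a simplified normalizer.
    import re
    import unicodedata

    def normalize_text(value: str) -> str:
        # rough normalization: drop accents, lowercase, turn separators into dashes
        value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode()
        return re.sub(r"[^a-z0-9]+", "-", value.strip().lower()).strip("-")

    line = "stopwords:fr: aux,au,de,le,du,la,a,et,test normalisation"
    line = line[10:]                                   # drop the "stopwords:" prefix
    lc = line[:2]                                      # language code, e.g. "fr"
    tags = [w.strip() for w in line[3:].split(",")]    # raw values, kept for re-export
    tag_ids = [normalize_text(w) for w in tags]        # normalized ids, used as stop words

    print(lc)       # fr
    print(tags)     # ['aux', 'au', ..., 'test normalisation']
    print(tag_ids)  # ['aux', 'au', ..., 'test-normalisation']

Because both forms are stored on the node (tags_fr and tags_ids_fr), the unparser in patch 1 can drop its stopwords special case (dash_before_lc = 1) and read tags_ids_* uniformly for every node.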