From fa390306f616c1d8a99323b6fee476d65404facf Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Wed, 31 Jan 2024 23:51:09 +0100
Subject: [PATCH 1/3] remove stopwords from normalised tags on node update

---
 backend/editor/entries.py                     | 29 +++++++++++++++++--
 .../parser/taxonomy_parser.py                 | 19 ++----------
 parser/openfoodfacts_taxonomy_parser/utils.py | 11 ++++++-
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 5a712d6d..8bf2aca1 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -60,13 +60,14 @@ async def create_node(self, label, entry, main_language_code):
         """
         params = {"id": entry}
         query = [f"""CREATE (n:{self.project_name}:{label})\n"""]
+        stopwords = await self.get_stopwords_dict()
 
         # Build all basic keys of a node
         if label == "ENTRY":
             # Normalizing new canonical tag
             language_code, canonical_tag = entry.split(":", 1)
             normalised_canonical_tag = parser_utils.normalize_text(
-                canonical_tag, main_language_code
+                canonical_tag, main_language_code, stopwords=stopwords
             )
 
             # Reconstructing and updation of node ID
@@ -438,6 +439,27 @@ async def get_children(self, entry):
         result = await get_current_transaction().run(query, {"id": entry})
         return await async_list(result)
 
+    async def get_stopwords_dict(self):
+        """
+        Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
+        where the keys are the language codes and the values are the stopwords in the corresponding language
+        """
+        query = f"""
+        MATCH (s:{self.project_name}:STOPWORDS)
+        WITH keys(s) AS properties, s
+        UNWIND properties AS property
+        WITH s, property
+        WHERE property STARTS WITH 'tags_ids'
+            RETURN property AS tags_ids_lc, s[property] AS stopwords
+        """
+        result = await get_current_transaction().run(query)
+        records = await async_list(result)
+        stopwords_dict = {}
+        for record in records:
+            language_code = record["tags_ids_lc"].split("_")[-1]
+            stopwords_dict[language_code] = record["stopwords"]
+        return stopwords_dict
+
     async def update_node(self, label, entry, new_node):
         """
         Helper function used for updation of node with given id and label
@@ -471,6 +493,7 @@ async def update_node(self, label, entry, new_node):
 
         # Adding normalized tags ids corresponding to entry tags
         normalised_new_node = {}
+        stopwords = await self.get_stopwords_dict()
         for key in set(new_node.keys()) - deleted_keys:
             if key.startswith("tags_"):
                 if "_ids_" not in key:
@@ -478,7 +501,9 @@ async def update_node(self, label, entry, new_node):
                     normalised_value = []
                     for value in new_node[key]:
                         normalised_value.append(
-                            parser_utils.normalize_text(value, keys_language_code)
+                            parser_utils.normalize_text(
+                                value, keys_language_code, stopwords=stopwords
+                            )
                         )
                     normalised_new_node[key] = new_node[key]
                     normalised_new_node["tags_ids_" + keys_language_code] = normalised_value

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index ff91f723..bb817259 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -98,19 +98,6 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
             line_count += 1
         yield line_count, ""  # to end the last entry if not ended
 
-    def _remove_stopwords(self, lc: str, words: str) -> str:
-        """Remove the stopwords that were read at the beginning of the file"""
-        # First check if this language has stopwords
-        if lc in self.stopwords:
-            words_to_remove = self.stopwords[lc]
-            new_words = []
-            for word in words.split("-"):
-                if word not in words_to_remove:
-                    new_words.append(word)
-            return ("-").join(new_words)
-        else:
-            return words
-
     def _add_line(self, line: str) -> str:
         """
         Get a normalized string but keeping the language code "lc:",
@@ -118,7 +105,7 @@ def _add_line(self, line: str) -> str:
         """
         lc, line = line.split(":", 1)
         new_line = lc + ":"
-        new_line += self._remove_stopwords(lc, normalize_text(line, lc))
+        new_line += normalize_text(line, lc, stopwords=self.stopwords)
         return new_line
 
     def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
@@ -126,7 +113,7 @@ def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
         lc, line = line.split(":", 1)
         new_line: list[str] = []
         for word in line.split(","):
-            new_line.append(self._remove_stopwords(lc, normalize_text(word, lc)))
+            new_line.append(normalize_text(word, lc, stopwords=self.stopwords))
         return lc, new_line
 
     def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
@@ -291,7 +278,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                     tagsids_list = []
                     for word in line.split(","):
                         tags_list.append(word.strip())
-                        word_normalized = self._remove_stopwords(lang, normalize_text(word, lang))
+                        word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
                         if word_normalized not in tagsids_list:
                             # in case 2 normalized synonyms are the same
                             tagsids_list.append(word_normalized)

diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py
index 8b942186..0e343ef4 100644
--- a/parser/openfoodfacts_taxonomy_parser/utils.py
+++ b/parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,7 +4,7 @@
 import unidecode
 
 
-def normalize_text(line: str, lang="default", char="-"):
+def normalize_text(line: str, lang="default", char="-", stopwords={}):
     """Normalize a string depending on the language code"""
     line = unicodedata.normalize("NFC", line)
 
@@ -29,6 +29,15 @@ def normalize_text(line: str, lang="default", char="-"):
     # Removing excess "-"
     line = re.sub(r"-+", char, line)
     line = line.strip(char)
+
+    # Remove stopwords
+    if lang in stopwords:
+        stopwords = stopwords[lang]
+        line_surrounded_by_char = char + line + char
+        for stopword in stopwords:
+            line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
+        line = line_surrounded_by_char[1:-1]
+
     return line
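Note on patch 1: the parser's _remove_stopwords helper is folded into normalize_text itself, so the backend editor and the parser now share one code path. A minimal standalone sketch of the added stopword-removal step, runnable on its own (the stopwords dict and sample words here are hypothetical; in the real function, lowercasing, accent stripping, and separator joining have already produced `line` before this step runs):

    # Sketch of the stopword-removal step that patch 1 adds to normalize_text.
    def remove_stopwords(line: str, lang: str, stopwords: dict[str, list[str]], char: str = "-") -> str:
        if lang in stopwords:
            # Surrounding the line with the separator lets every word, including
            # the first and the last, be matched as "-word-", so a stopword is
            # never removed as a substring of a longer word.
            line_surrounded_by_char = char + line + char
            for stopword in stopwords[lang]:
                line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char)
            line = line_surrounded_by_char[1:-1]
        return line

    # "au" is dropped as a standalone word but left intact inside "saumon":
    assert remove_stopwords("pates-au-saumon", "fr", {"fr": ["au", "aux", "de"]}) == "pates-saumon"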
From e7f14b100a84b51efb305a09f9d0cd3855a223ef Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Thu, 1 Feb 2024 00:02:21 +0100
Subject: [PATCH 2/3] format

---
 backend/editor/entries.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 8bf2aca1..1e448112 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -442,14 +442,15 @@ async def get_children(self, entry):
     async def get_stopwords_dict(self):
         """
         Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
-        where the keys are the language codes and the values are the stopwords in the corresponding language
+        where the keys are the language codes, and the values are the stopwords in the
+        corresponding language
         """
         query = f"""
-        MATCH (s:{self.project_name}:STOPWORDS)
-        WITH keys(s) AS properties, s
-        UNWIND properties AS property
-        WITH s, property
-        WHERE property STARTS WITH 'tags_ids'
+            MATCH (s:{self.project_name}:STOPWORDS)
+            WITH keys(s) AS properties, s
+            UNWIND properties AS property
+            WITH s, property
+            WHERE property STARTS WITH 'tags_ids'
             RETURN property AS tags_ids_lc, s[property] AS stopwords
         """
         result = await get_current_transaction().run(query)
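Note on the query that patch 2 reformats: each taxonomy stores its stopwords as tags_ids_<language code> properties on a STOPWORDS node, and the Cypher query unwinds those property names so the Python side can key the result by language code. A small sketch of the reshaping done after the query, using hypothetical rows in the shape the query returns:

    # Hypothetical rows, one per "tags_ids_<lc>" property on a STOPWORDS node.
    records = [
        {"tags_ids_lc": "tags_ids_en", "stopwords": ["a", "an", "the"]},
        {"tags_ids_lc": "tags_ids_fr", "stopwords": ["au", "aux", "de"]},
    ]

    stopwords_dict = {}
    for record in records:
        # "tags_ids_en" -> "en": the language code is the last "_"-separated piece.
        language_code = record["tags_ids_lc"].split("_")[-1]
        stopwords_dict[language_code] = record["stopwords"]

    assert stopwords_dict == {"en": ["a", "an", "the"], "fr": ["au", "aux", "de"]}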
From 5f4ce9558fc0f0d149fed5aca52755b8966ab6a9 Mon Sep 17 00:00:00 2001
From: Charles Perier
Date: Thu, 1 Feb 2024 17:59:49 +0100
Subject: [PATCH 3/3] refactor to resolve comments

---
 backend/editor/entries.py                     | 9 ++++-----
 parser/openfoodfacts_taxonomy_parser/utils.py | 5 ++++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index 1e448112..9aba4751 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -439,7 +439,7 @@ async def get_children(self, entry):
         result = await get_current_transaction().run(query, {"id": entry})
         return await async_list(result)
 
-    async def get_stopwords_dict(self):
+    async def get_stopwords_dict(self) -> dict[str, list[str]]:
         """
         Helper function used for getting all stopwords in a taxonomy, in the form of a dictionary
         where the keys are the language codes, and the values are the stopwords in the
@@ -455,10 +455,9 @@ async def get_stopwords_dict(self):
         """
         result = await get_current_transaction().run(query)
         records = await async_list(result)
-        stopwords_dict = {}
-        for record in records:
-            language_code = record["tags_ids_lc"].split("_")[-1]
-            stopwords_dict[language_code] = record["stopwords"]
+        stopwords_dict = {
+            record["tags_ids_lc"].split("_")[-1]: record["stopwords"] for record in records
+        }
         return stopwords_dict
 
     async def update_node(self, label, entry, new_node):

diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py
index 0e343ef4..54024922 100644
--- a/parser/openfoodfacts_taxonomy_parser/utils.py
+++ b/parser/openfoodfacts_taxonomy_parser/utils.py
@@ -4,8 +4,11 @@
 import unidecode
 
 
-def normalize_text(line: str, lang="default", char="-", stopwords={}):
+def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: dict[str, list[str]] | None = None) -> str:
     """Normalize a string depending on the language code"""
+    if stopwords is None:
+        stopwords = {}
+
     line = unicodedata.normalize("NFC", line)
 
     # Removing accent
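Note on patch 3: the review-driven refactor swaps the stopwords={} default for a None sentinel checked inside the function, and adds return-type and parameter annotations. normalize_text only reads the dict, so the mutable default was not an active bug here, but it is the standard Python trap the sentinel avoids: a default value is evaluated once, at definition time, and shared by every call. A hypothetical illustration, not taken from the codebase:

    # A mutable default is created once and leaks state across calls.
    def risky(value, seen=[]):
        seen.append(value)
        return seen

    assert risky("a") == ["a"]
    assert risky("b") == ["a", "b"]  # the list from the first call is reused!

    # The None sentinel used in patch 3 gives each call a fresh object instead.
    def safe(value, seen=None):
        if seen is None:
            seen = []
        seen.append(value)
        return seen

    assert safe("a") == ["a"]
    assert safe("b") == ["b"]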