Commit 780ad2a

feat(parser,frontend,backend): conserve comments order and sort properties at export (#364)
* feat: conserve comments order

* remove unused import

* sort properties at export and fix tests

* avoid storing comments with empty lists

* fix update_node controller to remove comments when needed

* handle prop comments and potential issues

* refactor and make pure functions

* fix and format

* hide comments in properties table, and fix set changing size during iteration
perierc authored Feb 7, 2024
1 parent 1304822 commit 780ad2a
Showing 8 changed files with 124 additions and 88 deletions.
41 changes: 26 additions & 15 deletions backend/editor/entries.py
@@ -286,6 +286,12 @@ def is_valid_branch_name(self):
"""
return parser_utils.normalize_text(self.branch_name, char="_") == self.branch_name

def is_tag_key(self, key):
"""
Helper function to check if a key is a tag key (tags_XX)
"""
return key.startswith("tags_") and not ("_ids_" in key or key.endswith("_comments"))

async def list_projects(self, status=None):
"""
Helper function for listing all existing projects created in Taxonomy Editor
@@ -494,12 +500,21 @@ async def update_node(self, label, entry, new_node):
curr_node = result[0]["n"]
deleted_keys = set(curr_node.keys()) - set(new_node.keys())

# Check for deleted keys having associated comments and delete them too
comments_keys_to_delete = set()
for key in deleted_keys:
comments_key = key + "_comments"
if new_node.get(comments_key) is not None:
comments_keys_to_delete.add(comments_key)
deleted_keys = deleted_keys.union(comments_keys_to_delete)

# Check for keys having null/empty values
for key in new_node.keys():
if ((new_node[key] == []) or (new_node[key] is None)) and key != "preceding_lines":
deleted_keys.add(key)
deleted_keys.add(key + "_comments")
# Delete tags_ids if we delete tags of a language
if key.startswith("tags_") and "_ids_" not in key:
if self.is_tag_key(key):
deleted_keys.add("tags_ids_" + key.split("_", 1)[1])

# Build query
@@ -515,20 +530,16 @@ async def update_node(self, label, entry, new_node):
normalised_new_node = {}
stopwords = await self.get_stopwords_dict()
for key in set(new_node.keys()) - deleted_keys:
if key.startswith("tags_"):
if "_ids_" not in key:
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(
value, keys_language_code, stopwords=stopwords
)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
pass # We generate tags_ids, and ignore the one sent
if self.is_tag_key(key):
# Normalise tags and store them in tags_ids
keys_language_code = key.split("_", 1)[1]
normalised_value = []
for value in new_node[key]:
normalised_value.append(
parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords)
)
normalised_new_node[key] = new_node[key]
normalised_new_node["tags_ids_" + keys_language_code] = normalised_value
else:
# No need to normalise
normalised_new_node[key] = new_node[key]
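For illustration, a small sketch (assumed data shapes, not the commit's code) of the deleted-keys bookkeeping in update_node: emptied keys take their comment lists and generated tags_ids along with them.

# Sketch: emptied keys drag their comment lists and generated ids along
new_node = {"tags_en": [], "tags_en_comments": ["# note"], "prop_vegan_en": "no"}
curr_keys = {"tags_en", "tags_en_comments", "tags_ids_en", "prop_vegan_en"}

deleted_keys = curr_keys - set(new_node.keys())  # {'tags_ids_en'} here
for key in new_node:
    if new_node[key] in ([], None) and key != "preceding_lines":
        deleted_keys |= {key, key + "_comments"}
        if key.startswith("tags_") and "_ids_" not in key:
            deleted_keys.add("tags_ids_" + key.split("_", 1)[1])

print(sorted(deleted_keys))  # ['tags_en', 'tags_en_comments', 'tags_ids_en']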
9 changes: 5 additions & 4 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -66,16 +66,17 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
# we don't know in advance which properties and tags
# we will encounter in the batch
# so we accumulate them in this set
-        seen_properties_and_tags = set()
+        seen_properties_and_tags_and_comments = set()

for entry_node in entry_nodes:
if entry_node.get_node_type() != NodeType.ENTRY:
raise ValueError(f"Only ENTRY nodes should be passed to this function")
-            seen_properties_and_tags.update(entry_node.tags)
-            seen_properties_and_tags.update(entry_node.properties)
+            seen_properties_and_tags_and_comments.update(entry_node.tags)
+            seen_properties_and_tags_and_comments.update(entry_node.properties)
+            seen_properties_and_tags_and_comments.update(entry_node.comments)

additional_properties_queries = [
f"{key} : entry_node.{key}" for key in seen_properties_and_tags
f"{key} : entry_node.{key}" for key in seen_properties_and_tags_and_comments
]

base_properties_query = f"""
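For illustration, a sketch of the batch-key accumulation above with hypothetical nodes; every tag, property, and now comment key seen in the batch ends up in the Cypher SET fragment.

# Sketch (hypothetical nodes): accumulating every key seen in a batch so the
# query can set them all; comment keys now ride along with tags and properties
nodes = [
    {"tags": {"tags_en": ["meat"]}, "properties": {}, "comments": {}},
    {"tags": {}, "properties": {"prop_vegan_en": "no"},
     "comments": {"prop_vegan_en_comments": ["# checked"]}},
]
seen = set()
for node in nodes:
    seen.update(node["tags"])
    seen.update(node["properties"])
    seen.update(node["comments"])

print(", ".join(f"{key} : entry_node.{key}" for key in sorted(seen)))
# prop_vegan_en : entry_node.prop_vegan_en, prop_vegan_en_comments : entry_node.prop_vegan_en_comments, tags_en : entry_node.tags_en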
89 changes: 75 additions & 14 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,4 +1,5 @@
import collections
import copy
import logging
import re
import sys
@@ -25,10 +26,16 @@ class NodeData:
is_before: str | None = None
main_language: str | None = None
preceding_lines: list[str] = field(default_factory=list)
-    parent_tag: list[tuple[str, int]] = field(default_factory=list)
+    parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
# comments_stack is a list of tuples (line_number, comment),
# to keep track of comments just above the current line
# during parsing of an entry, to be able to add them
# to the right property or tag when possible
comments_stack: list[(int, str)] = field(default_factory=list)

def to_dict(self):
return {
@@ -38,6 +45,7 @@ def to_dict(self):
"src_position": self.src_position,
**self.properties,
**self.tags,
**self.comments,
}

def get_node_type(self):
@@ -151,12 +159,12 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:

def _entry_end(self, line: str, data: NodeData) -> bool:
"""Return True if the block ended"""
-        # stopwords and synonyms are one-liner, entries are separated by a blank line
-        if line.startswith("stopwords") or line.startswith("synonyms") or not line:
-            # can be the end of an block or just additional line separator,
-            # file_iter() always end with ''
-            if data.id:  # to be sure that it's an end
-                return True
+        if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
+            # stopwords and synonyms are one-liners; if the id is set, it's the end
+            return True
+        if not line and data.id:
+            # entries are separated by a blank line
+            return True
return False

def _remove_separating_line(self, data: NodeData) -> NodeData:
@@ -186,6 +194,43 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
data.preceding_lines.pop(0)
return data

def _get_node_data_with_comments_above_key(
self, data: NodeData, line_number: int, key: str
) -> NodeData:
"""Returns the updated node data with comments above the given
key stored in the {key}_comments property."""
new_data = copy.deepcopy(data)

# Get comments just above the given line
comments_above = []
current_line = line_number - 1
while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
comments_above.append(new_data.comments_stack.pop()[1])
current_line -= 1
if comments_above:
new_data.comments[key + "_comments"] = comments_above[::-1]

return new_data
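
For illustration, a standalone trace of the comment-attachment logic above with hypothetical values:

# Sketch: attaching stacked comments just above line 12 to a property key
comments_stack = [(10, "# first"), (11, "# second")]
comments = {}
line_number, key = 12, "prop_vegan_en"

comments_above = []
current_line = line_number - 1
while comments_stack and comments_stack[-1][0] == current_line:
    comments_above.append(comments_stack.pop()[1])
    current_line -= 1
if comments_above:
    comments[key + "_comments"] = comments_above[::-1]

print(comments)  # {'prop_vegan_en_comments': ['# first', '# second']}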

def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
"""Returns the updated node data with parent and end comments"""
new_data = copy.deepcopy(data)

# Get parent comments (part of an entry block and just above/between the parents lines)
parent_comments = []
while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
parent_comments.append(new_data.preceding_lines.pop())
if parent_comments:
new_data.comments["parent_comments"] = parent_comments[::-1]

# Get end comments (part of an entry block after the last tag/prop
# and before the separating blank line)
end_comments = [comment[1] for comment in new_data.comments_stack]
if end_comments:
new_data.comments["end_comments"] = end_comments

return new_data
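
For illustration, a standalone trace of the parent/end comment split with hypothetical values:

# Sketch: trailing preceding_lines become parent comments, leftover stacked
# comments become end comments
preceding_lines = ["# stray header", "", "# about the parents"]
comments_stack = [(42, "# trailing note")]
comments = {}

parent_comments = []
while preceding_lines and preceding_lines[-1] != "":
    parent_comments.append(preceding_lines.pop())
if parent_comments:
    comments["parent_comments"] = parent_comments[::-1]

end_comments = [comment[1] for comment in comments_stack]
if end_comments:
    comments["end_comments"] = end_comments

print(comments)
# {'parent_comments': ['# about the parents'], 'end_comments': ['# trailing note']}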

def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
"""Transform data from file to dictionary"""
saved_nodes = []
@@ -215,14 +260,21 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
self.parser_logger.error(msg)
else:
data = self._remove_separating_line(data)
if data.get_node_type() == NodeType.ENTRY:
data = self._get_node_data_with_parent_and_end_comments(data)
yield data # another function will use this dictionary to create a node
saved_nodes.append(data.id)
data = NodeData(is_before=data.id)

# harvest the line
if not (line) or line[0] == "#":
-                # comment or blank
-                data.preceding_lines.append(line)
+                # comment or blank line
+                if data.id:
+                    # we are within the node definition
+                    data.comments_stack.append((line_number, line))
+                else:
+                    # we are before the actual node
+                    data.preceding_lines.append(line)
else:
line = line.rstrip(",")
if not data.src_position:
@@ -234,7 +286,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
index_stopwords += 1
# remove "stopwords:" part
line = line[10:]
-                    # compute raw values outside _get_lc_value as it removes stop words !
+                    # compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -252,7 +304,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
id = "synonyms:" + str(index_synonyms)
data = self._set_data_id(data, id, line_number)
index_synonyms += 1
# remove "synonyms:" part
line = line[9:]
# compute raw values outside _get_lc_value as it normalizes them!
tags = [words.strip() for words in line[3:].split(",")]
try:
lc, value = self._get_lc_value(line)
@@ -265,7 +319,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
data.tags["tags_ids_" + lc] = value
elif line[0] == "<":
# parent definition
-                    data.parent_tag.append((self._add_line(line[1:]), line_number + 1))
+                    data.parent_tags.append((self._add_line(line[1:]), line_number + 1))
elif language_code_prefix.match(line):
# synonyms definition
if not data.id:
@@ -286,6 +340,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
tagsids_list.append(word_normalized)
data.tags["tags_" + lang] = tags_list
data.tags["tags_ids_" + lang] = tagsids_list
data = self._get_node_data_with_comments_above_key(
data, line_number, "tags_" + lang
)
else:
# property definition
property_name = None
@@ -306,7 +363,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'"
)
if property_name:
data.properties["prop_" + property_name + "_" + lc] = property_value
prop_key = "prop_" + property_name + "_" + lc
data.properties[prop_key] = property_value
data = self._get_node_data_with_comments_above_key(
data, line_number, prop_key
)

data.id = "__footer__"
data.preceding_lines.pop(0)
@@ -395,8 +456,8 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
other_nodes.append(entry)
if entry.is_before:
previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
-            if entry.parent_tag:
-                for position, (parent, line_position) in enumerate(entry.parent_tag):
+            if entry.parent_tags:
+                for position, (parent, line_position) in enumerate(entry.parent_tags):
raw_child_links.append(
ChildLink(
parent_id=parent,
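For illustration, a minimal sketch (assumed minimal state, hypothetical lines) of how _harvest_entries now routes a comment line: comments inside a node definition go to comments_stack, comments before it to preceding_lines.

# Sketch: where a comment line is routed during harvest
def route(line: str, line_number: int, data_id: str,
          comments_stack: list, preceding_lines: list) -> None:
    if data_id:
        comments_stack.append((line_number, line))  # inside a node definition
    else:
        preceding_lines.append(line)  # before the node starts

stack, preceding = [], []
route("# header note", 1, "", stack, preceding)             # no id yet
route("# vegan flag note", 7, "en:meat", stack, preceding)  # id already set
print(preceding, stack)  # ['# header note'] [(7, '# vegan flag note')]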
15 changes: 10 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -55,10 +55,10 @@ def get_tags_line(self, node, lc):
return lc + ":" + line

def list_property_and_lc(self, node):
"""return a ordered list of properties with their language code (lc)"""
"""return an ordered list of properties with their language code (lc)"""
# there is no rule for the order of properties
# properties will be arranged in alphabetical order
-        return [property[5:] for property in node if property.startswith("prop_")]
+        return sorted([property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")])

def get_property_line(self, node, property):
"""return a string that should look like the original property line"""
@@ -90,28 +90,33 @@ def iter_lines(self, project_label):
if add_blank:
yield ""
# comments
if node["preceding_lines"]:
yield from node["preceding_lines"]
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
-                    # synonyms line
+                    # parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])
previous_block_id = node["id"]

def rewrite_file(self, filename, lines):
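For illustration, a sketch of the new export ordering with a hypothetical node: properties are sorted alphabetically and comment keys are excluded, which is also why vegan:en:no moves after carbon_footprint_fr_foodges_value:fr:10 in the test fixture below.

# Sketch: ordering properties for export (node keys are hypothetical)
node = {
    "prop_vegan_en": "no",
    "prop_vegan_en_comments": ["# double-checked"],
    "prop_carbon_footprint_fr_foodges_value_fr": "10",
    "tags_en": ["meat"],
}
props = sorted(
    p[5:] for p in node if p.startswith("prop_") and not p.endswith("_comments")
)
print(props)  # ['carbon_footprint_fr_foodges_value_fr', 'vegan_en']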
2 changes: 1 addition & 1 deletion parser/tests/data/test.txt
@@ -23,8 +23,8 @@ fr:yaourts au fruit de la passion allégés
# meat

en:meat
-vegan:en:no
carbon_footprint_fr_foodges_value:fr:10
+vegan:en:no

<en:meat
en:fake-meat
2 changes: 1 addition & 1 deletion parser/tests/unit/test_parser_unit.py
@@ -25,7 +25,7 @@ def test_fileiter(neo4j):
file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
for counter, (_, line) in enumerate(file_iterator):
assert line == "" or line[0] == "#" or ":" in line
-        if counter == 26:
+        if counter == 25:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37

@@ -1,4 +1,4 @@
-import { Box, Grid, Paper, TextField, Typography } from "@mui/material";
+import { Box, Grid, Paper, Typography } from "@mui/material";
import MaterialTable, { MTableToolbar } from "@material-table/core";
import { useEffect, useState } from "react";

@@ -14,7 +14,11 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
// UUID of properties will have a "_uuid" suffix
// Ex: prop_vegan_en_uuid

if (key.startsWith("prop") && key.endsWith("uuid")) {
if (
key.startsWith("prop") &&
key.endsWith("uuid") &&
!key.endsWith("_comments_uuid")
) {
const uuid = nodeObject[key][0]; // UUID
// Removing "prop_" prefix from key to render only the name
const property_name = key.split("_").slice(1, -1).join("_");
@@ -33,13 +37,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {
setData(renderedProperties);
}, [nodeObject]);

-  // Helper function used for changing comments from node
-  const changeCommentData = (value) => {
-    const newNodeObject = { ...nodeObject };
-    newNodeObject["preceding_lines"] = value;
-    setNodeObject(newNodeObject);
-  };

// Helper function used for changing properties of node
const changePropertyData = (key, value) => {
setNodeObject((prevState) => {
@@ -60,21 +57,6 @@ const ListAllEntryProperties = ({ nodeObject, setNodeObject }) => {

return (
<Box>
-      {/* Comments */}
-      <Typography sx={{ ml: 4, mt: 2, mb: 1 }} variant="h5">
-        Comments
-      </Typography>
-      <TextField
-        sx={{ ml: 8, mt: 1, width: 250 }}
-        minRows={4}
-        multiline
-        onChange={(event) => {
-          changeCommentData(event.target.value.split("\n"));
-        }}
-        value={nodeObject?.preceding_lines.join("\n")}
-        variant="outlined"
-      />

{/* Properties */}
<Box sx={{ width: "50%", ml: 4 }}>
<MaterialTable
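For illustration, the same key filter sketched in Python (hypothetical keys): UUID columns for comment lists are skipped so they never appear as rows in the properties table.

# Sketch: which node keys become rows in the properties table
keys = ["prop_vegan_en_uuid", "prop_vegan_en_comments_uuid", "preceding_lines"]
rows = [k for k in keys
        if k.startswith("prop") and k.endswith("uuid")
        and not k.endswith("_comments_uuid")]
print(rows)  # ['prop_vegan_en_uuid']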