Skip to content

Commit

Permalink
feat: wip on:
Browse files Browse the repository at this point in the history
* adding a modified property
* adding lines location of entries
* keeping removed entries

The goal is to be able to patch taxonomy text files instead of re-generating them completely

Relates to: #541 and #366
  • Loading branch information
alexgarel committed Oct 31, 2024
1 parent 3077bae commit b5bcefc
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 31 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/github-projects-for-openfoodfacts-design.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,13 @@ jobs:
project-url: https://github.com/orgs/openfoodfacts/projects/5 # Add issue to the folksonomy project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: 🏷️ Folksonomy Project
label-operator: OR
label-operator: OR
- uses: actions/add-to-project@main
with:
project-url: https://github.com/orgs/openfoodfacts/projects/44 # Add issue to the data quality project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: 🧽 Data quality
label-operator: OR
label-operator: OR
- uses: actions/add-to-project@main
with:
project-url: https://github.com/orgs/openfoodfacts/projects/82 # Add issue to the search project
Expand All @@ -77,19 +77,19 @@ jobs:
project-url: https://github.com/orgs/openfoodfacts/projects/41 # Add issue to the producer platform project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: 🏭 Producers Platform
label-operator: OR
label-operator: OR
- uses: actions/add-to-project@main
with:
project-url: https://github.com/orgs/openfoodfacts/projects/19 # Add issue to the infrastructure project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: infrastructure
label-operator: OR
label-operator: OR
- uses: actions/add-to-project@main
with:
project-url: https://github.com/orgs/openfoodfacts/projects/92 # Add issue to the Nutri-Score project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: 🚦 Nutri-Score
label-operator: OR
label-operator: OR
- uses: actions/add-to-project@main
with:
project-url: https://github.com/orgs/openfoodfacts/projects/132 # Add issue to the Top upvoted issues board
Expand All @@ -107,4 +107,4 @@ jobs:
project-url: https://github.com/orgs/openfoodfacts/projects/35 # Add issue to the ♿️ accessibility project
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
labeled: ♿️ accessibility
label-operator: OR
label-operator: OR
2 changes: 1 addition & 1 deletion backend/editor/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ async def find_all_nodes(response: Response, branch: str, taxonomy_name: str):
Get all nodes within taxonomy
"""
taxonomy = TaxonomyGraph(branch, taxonomy_name)
all_nodes = await taxonomy.get_all_nodes("")
all_nodes = await taxonomy.get_all_nodes()
return all_nodes


Expand Down
1 change: 1 addition & 0 deletions backend/editor/controllers/node_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ async def create_entry_node(
"id": language_code + ":" + normalized_name,
f"tags_{language_code}": [name],
f"tags_ids_{language_code}": [normalized_name],
f"modified": datetime.datetime.now().timestamp(),
}
params = {"entry_node": entry_node_data}

Expand Down
63 changes: 46 additions & 17 deletions backend/editor/entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ async def list_projects(self, status=None):

async def add_node_to_end(self, label, entry):
"""
Helper function which adds an existing node to end of taxonomy
Helper function which adds a newly created node to the end of the taxonomy
"""
# Delete relationship between current last node and __footer__
query = f"""
Expand Down Expand Up @@ -394,28 +394,37 @@ async def add_node_to_beginning(self, label, entry):
async def delete_node(self, label, entry):
    """
    Helper function used for deleting a node with given id and label

    We don't really delete it because we have to keep track of modified nodes.
    Instead the node is taken out of the is_before chain
    and its entry type label is replaced by REMOVED_<label>.

    :param label: current entry type label of the node
    :param entry: id of the node to remove
    :return: the records produced by the relabelling query
    """
    params = {"id": entry}
    # Remove node from is_before chain and attach its previous node to its next node
    # NOTE(review): only is_before relationships are removed here,
    # any other relationship of the node is left in place - confirm this is wanted
    unlink_query = f"""
        // Find node to be deleted using node ID
        MATCH (deleted_node:{self.project_name}:{label})-[next_rel:is_before]->(next_node)
        WHERE deleted_node.id = $id
        MATCH (previous_node)-[previous_rel:is_before]->(deleted_node)
        // Take node out of the chain, the node itself is kept
        DELETE previous_rel, next_rel
        // Rebuild relationships after removal
        CREATE (previous_node)-[:is_before]->(next_node)
    """
    await get_current_transaction().run(unlink_query, params)
    # change label of node to be deleted
    relabel_query = f"""
        MATCH (deleted_node:{self.project_name}:{label}) WHERE deleted_node.id = $id
        REMOVE deleted_node:{label}
        SET deleted_node:REMOVED_{label}
    """
    result = await get_current_transaction().run(relabel_query, params)
    return await async_list(result)

async def get_all_nodes(self, label=None):
    """
    Helper function used for getting all nodes with/without given label

    :param label: label the nodes must have;
        when empty or None, nodes having any of the NodeType labels are returned
    :return: the list of records, each one exposing the node as `n`
    """
    wanted_labels = [label] if label else [node_type.value for node_type in NodeType]
    # Test labels in a WHERE clause instead of the node pattern:
    # a pattern like (n:A:B|C) mixes ":" and "|" which Cypher may reject
    label_filter = " OR ".join(f"n:{wanted_label}" for wanted_label in wanted_labels)
    query = f"""
        MATCH (n:{self.project_name})
        WHERE {label_filter}
        RETURN n
    """
    result = await get_current_transaction().run(query)
    return await async_list(result)
Expand Down Expand Up @@ -522,6 +531,9 @@ async def update_node(self, label, new_node: EntryNode):
# Build query
query = [f"""MATCH (n:{self.project_name}:{label}) WHERE n.id = $id """]

modified = datetime.datetime.now().timestamp()
query.append(f"""\nSET n.modified * ${modified}""")

# Delete keys removed by user
deleted_keys = (
(set(curr_node.tags.keys()) - set(new_node.tags.keys()))
Expand Down Expand Up @@ -558,24 +570,33 @@ async def update_node_children(self, entry, new_children_ids):
"""
Helper function used for updating the children of the node with given id
"""
modified = datetime.datetime.now().timestamp()
# Parse node ids from Neo4j Record object
current_children = [record["child.id"] for record in list(await self.get_children(entry))]
deleted_children = set(current_children) - set(new_children_ids)
added_children = set(new_children_ids) - set(current_children)

# Delete relationships
for child in deleted_children:
query = f"""
MATCH
(deleted_child:{self.project_name}:ENTRY)
-[rel:is_child_of]->
(parent:{self.project_name}:ENTRY)
WHERE parent.id = $id AND deleted_child.id = $child
DELETE rel
"""
await get_current_transaction().run(query, {"id": entry, "child": child})
query = f"""
MATCH
(deleted_child:{self.project_name}:ENTRY)
-[rel:is_child_of]->
(parent:{self.project_name}:ENTRY)
WHERE parent.id = $id AND deleted_child.id IN $children
DELETE rel
"""
await get_current_transaction().run(query, {"id": entry, "children": deleted_children})
# update children modified property
query = f"""
MATCH (child:{self.project_name}:ENTRY)
WHERE child.id in $children
SET child.modified = $modified
"""
await get_current_transaction().run(
query, {"children": deleted_children, "modified": modified}
)

# Create non-existing nodes
# get non-existing nodes
query = f"""
MATCH (child:{self.project_name}:ENTRY)
WHERE child.id in $ids RETURN child.id
Expand All @@ -586,7 +607,7 @@ async def update_node_children(self, entry, new_children_ids):

# Normalising new children node ID
created_child_ids = []

# create new nodes
for child in to_create:
main_language_code, child_name = child.split(":", 1)
created_node_id = await self.create_entry_node(child_name, main_language_code)
Expand All @@ -612,5 +633,13 @@ async def update_node_children(self, entry, new_children_ids):
query, {"id": entry, "child_id": child_id}
)
result = list(await _result.value())
# update modified of existing but added children entries
# update children modified property
query = f"""
MATCH (child:{self.project_name}:ENTRY)
WHERE child.id in $children
SET child.modified = $modified
"""
await get_current_transaction().run(query, {"children": existing_ids, "modified": modified})

return result
21 changes: 17 additions & 4 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label
base_properties_query = """
id: $id,
preceding_lines: $preceding_lines,
src_position: $src_position
src_position: $src_position,
src_lines: $src_lines
"""

properties_query = ",\n".join([base_properties_query, *node_tags_queries])
Expand Down Expand Up @@ -228,7 +229,8 @@ def _create_node_indexes(self, project_label: str):

self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds")

def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
def _write_nodes_to_db(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str):
"""Create the taxonomy objects in the database: nodes and link between them"""
project_label = get_project_name(taxonomy_name, branch_name)
# First create nodes,
# then create node indexes to accelerate relationship creation,
Expand All @@ -239,6 +241,16 @@ def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name
self._create_child_links(taxonomy.child_links, project_label)
self._create_previous_links(taxonomy.previous_links, project_label)

def _add_text_to_project(self, filename: str, taxonomy_name: str, branch_name: str):
    """Add file content to the db

    The whole content of `filename` (read as UTF-8) is stored
    in the `original_text` property of the nodes matched by the project label.

    :param filename: path of the taxonomy source file
    :param taxonomy_name: used with branch_name to compute the project label
    :param branch_name: used with taxonomy_name to compute the project label
    """
    project_label = get_project_name(taxonomy_name, branch_name)
    # NOTE(review): this matches every node carrying the project label,
    # presumably only the project node is targeted - confirm
    query = f"""
        MATCH (n:{project_label})
        SET n.original_text = $original_text
    """
    # use a context manager so that the file is closed even if reading fails
    with open(filename, "r", encoding="utf-8") as source_file:
        params = {"original_text": source_file.read()}
    self.session.run(query, params)

def __call__(
self,
main_filename: str,
Expand All @@ -256,8 +268,9 @@ def __call__(
taxonomy = taxonomy_parser.parse_file(
main_filename, external_filenames, self.parser_logger
)

self._write_to_database(taxonomy, taxonomy_name, branch_name)
# add file content to the db
self._add_text_to_project(main_filename, taxonomy_name, branch_name)
self._write_nodes_to_db(taxonomy, taxonomy_name, branch_name)

self.parser_logger.info(
f"Finished parsing {taxonomy_name} in {timeit.default_timer() - start_time} seconds"
Expand Down
18 changes: 15 additions & 3 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class NodeData:
preceding_lines: list[str] = field(default_factory=list)
parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
# lines taken by this entry in the source file
# this can be more than (start, end) if we merged duplicates
src_lines: list[tuple[int, int]] | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
Expand All @@ -39,6 +42,7 @@ def to_dict(self):
"main_language": self.main_language,
"preceding_lines": self.preceding_lines,
"src_position": self.src_position,
"src_lines": self.src_lines,
"is_external": self.is_external,
"original_taxonomy": self.original_taxonomy,
**self.properties,
Expand Down Expand Up @@ -230,8 +234,9 @@ def is_entry_synonyms_line(self, line):
)
return False

def finalize_data(self, data, comments, saved_nodes):
def finalize_data(self, data, comments, saved_nodes, line_number: int):
data = self._remove_separating_line(data)
data.src_lines = [(data.src_position, line_number)]
if data.get_node_type() == NodeType.ENTRY:
self._add_comments(data, comments, "end")
if data.id in saved_nodes:
Expand Down Expand Up @@ -376,7 +381,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
# another function will use data to create a node
if previous_data is not None:
yield previous_data # it's ok with this one
previous_data = self.finalize_data(data, comments, saved_nodes)
previous_data = self.finalize_data(data, comments, saved_nodes, line_number)
# if data was a duplicate (is_before is None) reuse same is_before
is_before = data.id if data.is_before else is_before
data = NodeData(is_before=is_before)
Expand Down Expand Up @@ -562,6 +567,8 @@ def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[Node
)
# union of the preceding_lines comments
first_node.preceding_lines.extend(node.preceding_lines)
# union of src_lines
first_node.src_lines.extend(node.src_lines)
else:
unique_entry_nodes.append(node)
ids_to_nodes[node.id] = node
Expand All @@ -587,7 +594,12 @@ def _create_taxonomy(
entry_nodes: list[NodeData] = []
entry_nodes.extend(external_entry_nodes)
other_nodes = [
NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1)
NodeData(
id="__header__",
preceding_lines=harvested_header_data,
src_position=1,
src_lines=[(1, entries_start_line - 1)],
)
]
previous_links: list[PreviousLink] = []
raw_child_links: list[ChildLink] = []
Expand Down
1 change: 1 addition & 0 deletions parser/openfoodfacts_taxonomy_parser/patcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FIXME
2 changes: 2 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""This module takes a taxonomy from a Neo4j database and write it down as a text file"""

import os
import sys

Expand Down

0 comments on commit b5bcefc

Please sign in to comment.