From 5067e15138e57457605660519abc42774896cb6c Mon Sep 17 00:00:00 2001
From: Kristoffer Andersson
Date: Wed, 13 Nov 2024 15:32:46 +0100
Subject: [PATCH] feat: connect isolated tokens based on index

---
 src/parallel_corpus/graph.py | 37 +++++++++++++++++++++++++++++++++++++
 tests/test_graph.py          | 10 ++++++++++
 2 files changed, 47 insertions(+)

diff --git a/src/parallel_corpus/graph.py b/src/parallel_corpus/graph.py
index 0ccfb2f..8b80a5b 100644
--- a/src/parallel_corpus/graph.py
+++ b/src/parallel_corpus/graph.py
@@ -1,5 +1,6 @@
 """Parallel corpus as a graph."""
 
+import copy
 import itertools
 import logging
 import re
@@ -235,6 +236,42 @@ def target_text(g: SourceTarget[List[text_token.Text]]) -> str:
     return text_token.text(g.target)
 
 
+def connect_isolated_tokens_based_on_index(g: Graph) -> Graph:
+    """Connect isolated source and target tokens that share an index.
+
+    An edge counts as isolated when it holds exactly one token id. Every
+    isolated source edge whose id, with the leading "s" swapped for "t",
+    names an isolated target edge is merged with that target edge.
+
+    Args:
+        g: Graph to process; it is deep-copied and never mutated.
+
+    Returns:
+        A copy of ``g`` in which each matched pair is replaced by its merge.
+    """
+    g_new = copy.deepcopy(g)
+    # Snapshot the isolated edges first: g_new.edges is mutated below.
+    isolated_edges = [e for e in g_new.edges.values() if len(e.ids) == 1]
+    isolated_target_edges = {
+        e.ids[0]: e for e in isolated_edges if e.ids[0].startswith("t")
+    }
+    for s_edge in isolated_edges:
+        s_id = s_edge.ids[0]
+        if not s_id.startswith("s"):
+            continue
+        # Swap only the prefix; str.replace would rewrite every "s" in the id.
+        t_edge = isolated_target_edges.get("t" + s_id[1:])
+        if t_edge is None:
+            continue
+        del g_new.edges[s_edge.id]
+        del g_new.edges[t_edge.id]
+
+        new_edge = merge_edges(s_edge, t_edge)
+        g_new.edges[new_edge.id] = new_edge
+
+    return g_new
+
+
 @dataclass
 class CharIdPair:
     char: str
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 900b8c0..a514034 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -305,3 +305,13 @@ def test_unaligned_rearrange() -> None:
 
 
     # target_text(unaligned_rearrange(init(), 1, 2, 0)) // =>
+
+
+def test_connecting_token_based_on_index() -> None:
+    original_text = "Den 24 maj: JfS 404: pä ansökan af handlanden E. G. Petersson i Mönstcräs. Den 25,maj: M 383: pá ansökan as"  # noqa: E501
+    generated_text = "Den 24 maj: № 404: på ansökan af handlanden E. G. Petersson i Mönsterås. \nDen 25 maj: № 383: på ansökan af"  # noqa: E501
+
+    g = graph.init_with_source_and_target(original_text, generated_text)
+    print(f"{g=}")
+    g_new = graph.connect_isolated_tokens_based_on_index(g)
+    assert "e-s3-t3" in g_new.edges