Skip to content

Commit

Permalink
feat: connect isolated tokens based on index
Browse files Browse the repository at this point in the history
  • Loading branch information
kod-kristoff committed Nov 13, 2024
1 parent 9cbd2bc commit 5067e15
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
22 changes: 22 additions & 0 deletions src/parallel_corpus/graph.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Parallel corpus as a graph."""

import copy
import itertools
import logging
import re
Expand Down Expand Up @@ -235,6 +236,27 @@ def target_text(g: SourceTarget[List[text_token.Text]]) -> str:
return text_token.text(g.target)


def connect_isolated_tokens_based_on_index(g: Graph) -> Graph:
    """Pair up isolated source and target tokens that share the same index.

    An edge counts as isolated when it holds exactly one token id. For every
    isolated source edge (token id starting with ``"s"``) the counterpart id
    is built by swapping that leading ``"s"`` for ``"t"``; if an isolated
    target edge with that id exists, the two edges are removed and replaced
    by the result of ``merge_edges``.

    Args:
        g: The graph to process. It is deep-copied first; all changes are
            made on the copy.

    Returns:
        The modified copy of ``g``.
    """
    g_new = copy.deepcopy(g)

    # Collect the candidates up front so that ``g_new.edges`` is not modified
    # while it is being iterated.
    isolated_source_edges = []
    isolated_target_by_id = {}
    for edge in g_new.edges.values():
        if len(edge.ids) != 1:
            continue
        token_id = edge.ids[0]
        if token_id.startswith("s"):
            isolated_source_edges.append(edge)
        elif token_id.startswith("t"):
            # setdefault keeps the first edge seen for a given token id.
            isolated_target_by_id.setdefault(token_id, edge)

    for s_edge in isolated_source_edges:
        # Swap only the leading marker; str.replace("s", "t") would also
        # rewrite any later "s" in the id.
        needle = "t" + s_edge.ids[0][1:]
        # pop() so that a target edge can be consumed at most once.
        t_edge = isolated_target_by_id.pop(needle, None)
        if t_edge is None:
            continue

        del g_new.edges[s_edge.id]
        del g_new.edges[t_edge.id]

        new_edge = merge_edges(s_edge, t_edge)
        g_new.edges[new_edge.id] = new_edge

    return g_new


@dataclass
class CharIdPair:
char: str
Expand Down
10 changes: 10 additions & 0 deletions tests/test_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,13 @@ def test_unaligned_rearrange() -> None:


# target_text(unaligned_rearrange(init(), 1, 2, 0)) // =>


def test_connecting_token_based_on_index() -> None:
    """Check that edge ``e-s3-t3`` exists after connecting isolated tokens."""
    source_text = "Den 24 maj: JfS 404: pä ansökan af handlanden E. G. Petersson i Mönstcräs. Den 25,maj: M 383: pá ansökan as"  # noqa: E501
    target_text = "Den 24 maj: № 404: på ansökan af handlanden E. G. Petersson i Mönsterås. Den 25 maj: № 383: på ansökan af"  # noqa: E501

    # Build the graph from the two texts.
    g = graph.init_with_source_and_target(source_text, target_text)
    print(f"{g=}")

    connected = graph.connect_isolated_tokens_based_on_index(g)
    assert "e-s3-t3" in connected.edges

0 comments on commit 5067e15

Please sign in to comment.