From 1e2eb6e3383b9c9329d81cdf99c9b7f6460f13d7 Mon Sep 17 00:00:00 2001
From: Kristoffer Andersson
Date: Tue, 22 Oct 2024 14:42:23 +0200
Subject: [PATCH] refactor: fix type errors

---
 src/parallel_corpus/graph.py        | 34 ++++++-------
 src/parallel_corpus/shared/dicts.py |  2 +-
 .../{token.py => text_token.py}     |  0
 tests/test_graph.py                 | 50 +++++++++----------
 tests/test_shared/test_lists.py     |  5 +-
 tests/test_token.py                 |  2 +-
 6 files changed, 47 insertions(+), 46 deletions(-)
 rename src/parallel_corpus/{token.py => text_token.py} (100%)

diff --git a/src/parallel_corpus/graph.py b/src/parallel_corpus/graph.py
index f083907..bbdd326 100644
--- a/src/parallel_corpus/graph.py
+++ b/src/parallel_corpus/graph.py
@@ -9,11 +9,11 @@
 import parallel_corpus.shared.ranges
 import parallel_corpus.shared.str_map
 import parallel_corpus.shared.union_find
-from parallel_corpus import shared, token
+from parallel_corpus import shared, text_token
 from parallel_corpus.shared import dicts, diffs, ids, lists
 from parallel_corpus.shared.unique_check import UniqueCheck
 from parallel_corpus.source_target import Side, SourceTarget, map_sides
-from parallel_corpus.token import Token
+from parallel_corpus.text_token import Token
 
 A = TypeVar("A")
 B = TypeVar("B")
@@ -84,20 +84,20 @@ def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]:  # noqa: D103
 
 
 def init(s: str, *, manual: bool = False) -> Graph:  # noqa: D103
-    return init_from(token.tokenize(s), manual=manual)
+    return init_from(text_token.tokenize(s), manual=manual)
 
 
 def init_with_source_and_target(source: str, target: str, *, manual: bool = False) -> Graph:  # noqa: D103
     return init_from_source_and_target(
-        source=token.tokenize(source), target=token.tokenize(target), manual=manual
+        source=text_token.tokenize(source), target=text_token.tokenize(target), manual=manual
     )
 
 
 def init_from(tokens: List[str], *, manual: bool = False) -> Graph:  # noqa: D103
     return align(
         Graph(
-            source=token.identify(tokens, "s"),
-            target=token.identify(tokens, "t"),
+            source=text_token.identify(tokens, "s"),
+            target=text_token.identify(tokens, "t"),
             edges=edge_record(
                 (edge([f"s{i}", f"t{i}"], [], manual=manual) for i, _ in enumerate(tokens))
             ),
@@ -108,8 +108,8 @@ def init_from(tokens: List[str], *, manual: bool = False) -> Graph:  # noqa: D10
 def init_from_source_and_target(  # noqa: D103
     source: List[str], target: List[str], *, manual: bool = False
 ) -> Graph:
-    source_tokens = token.identify(source, "s")
-    target_tokens = token.identify(target, "t")
+    source_tokens = text_token.identify(source, "s")
+    target_tokens = text_token.identify(target, "t")
     return align(
         Graph(
             source=source_tokens,
@@ -204,7 +204,7 @@ def align(g: Graph) -> Graph:  # noqa: D103
             c.a is not None and c.b is not None and c.a.id is not None and c.b.id is not None
         ):
             uf.union(c.a.id, c.b.id)
-    proto_edges = {k: e for k, e in g.edges.items() if e.manual}
+    proto_edges: Dict[str, Edge] = {k: e for k, e in g.edges.items() if e.manual}
     first: UniqueCheck[str] = UniqueCheck()
 
     def update_edges(tokens, _side) -> None:  # noqa: ANN001
@@ -214,7 +214,7 @@ def update_edges(tokens, _side) -> None:  # noqa: ANN001
                 labels = e_repr.labels if first(e_repr.id) else []
                 e_token = edge([tok.id], labels, manual=False, comment=e_repr.comment)
                 dicts.modify(
-                    proto_edges,
+                    proto_edges,  # type: ignore[misc]
                     uf.find(tok.id),
                     zero_edge,
                     lambda e: merge_edges(e, e_token),  # noqa: B023
@@ -229,8 +229,8 @@ def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:  # noqa: D103
     return align(unaligned_rearrange(g, begin, end, dest))
 
 
-def target_text(g: SourceTarget[List[token.Text]]) -> str:  # noqa: D103
-    return token.text(g.target)
+def target_text(g: SourceTarget[List[text_token.Text]]) -> str:  # noqa: D103
+    return text_token.text(g.target)
 
 
 @dataclass
@@ -314,23 +314,23 @@ def unaligned_modify(
     Indexes are character offsets (use CodeMirror's doc.posFromIndex and doc.indexFromPos to convert)
     """  # noqa: E501
     tokens = get_side_texts(g, side)
-    token_at = token.token_at(tokens, from_)
+    token_at = text_token.token_at(tokens, from_)
     from_token, from_ix = token_at["token"], token_at["offset"]
     pre = (tokens[from_token] if from_token < len(tokens) else "")[:from_ix]
     if to == len(get_side_text(g, side)):
         return unaligned_modify_tokens(g, from_token, len(g.get_side(side)), pre + text, side)
-    to_token_at = token.token_at(tokens, to)
+    to_token_at = text_token.token_at(tokens, to)
     to_token, to_ix = to_token_at["token"], to_token_at["offset"]
     post = (tokens[to_token] or "")[to_ix:]
     return unaligned_modify_tokens(g, from_token, to_token + 1, pre + text + post, side)
 
 
 def get_side_text(g: Graph, side: Side) -> str:  # noqa: D103
-    return token.text(g.get_side(side))
+    return text_token.text(g.get_side(side))
 
 
 def get_side_texts(g: Graph, side: Side) -> List[str]:  # noqa: D103
-    return token.texts(g.get_side(side))
+    return text_token.texts(g.get_side(side))
 
 
 def unaligned_modify_tokens(
@@ -402,7 +402,7 @@ def unaligned_modify_tokens(
     id_offset = next_id(g)
 
     tokens = [
-        Token(t, f"{side[0]}{(id_offset + i)}") for i, t in enumerate(token.tokenize(text))
+        Token(t, f"{side[0]}{(id_offset + i)}") for i, t in enumerate(text_token.tokenize(text))
     ]
 
     new_tokens, removed = lists.splice(g.get_side(side), from_, to - from_, *tokens)
diff --git a/src/parallel_corpus/shared/dicts.py b/src/parallel_corpus/shared/dicts.py
index 7c2c07b..7041264 100644
--- a/src/parallel_corpus/shared/dicts.py
+++ b/src/parallel_corpus/shared/dicts.py
@@ -5,7 +5,7 @@
 if TYPE_CHECKING:
     from _typeshed import SupportsRichComparison
 
-    K = TypeVar("K", bound=SupportsRichComparison)
+    K = TypeVar("K", bound=SupportsRichComparison)  # type: ignore [syntax]
 else:
     K = TypeVar("K")
 
diff --git a/src/parallel_corpus/token.py b/src/parallel_corpus/text_token.py
similarity index 100%
rename from src/parallel_corpus/token.py
rename to src/parallel_corpus/text_token.py
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 06a8b81..900b8c0 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -2,14 +2,14 @@
 
 import pytest
 
-from parallel_corpus import graph, token
+from parallel_corpus import graph, text_token
 from parallel_corpus.source_target import Side, SourceTarget
 
 
 def test_graph_init() -> None:
     g = graph.init("w1 w2")
-    source = [token.Token(text="w1 ", id="s0"), token.Token(text="w2 ", id="s1")]
-    target = [token.Token(text="w1 ", id="t0"), token.Token(text="w2 ", id="t1")]
+    source = [text_token.Token(text="w1 ", id="s0"), text_token.Token(text="w2 ", id="s1")]
+    target = [text_token.Token(text="w1 ", id="t0"), text_token.Token(text="w2 ", id="t1")]
     edges = graph.edge_record([graph.edge(["s0", "t0"], []), graph.edge(["s1", "t1"], [])])
 
     assert g.source == source
@@ -24,8 +24,8 @@ def test_init_from_source_and_target_1() -> None:
 
 def test_init_from_source_and_target_2() -> None:
     g = graph.init_with_source_and_target(source="apa bepa", target="apa")
-    expected_source = token.identify(token.tokenize("apa bepa"), "s")
-    expected_target = token.identify(token.tokenize("apa"), "t")
+    expected_source = text_token.identify(text_token.tokenize("apa bepa"), "s")
+    expected_target = text_token.identify(text_token.tokenize("apa"), "t")
     g_expected = graph.Graph(
         source=expected_source,
         target=expected_target,
@@ -36,8 +36,8 @@ def test_init_from_source_and_target_2() -> None:
 
 def test_init_from_source_and_target_3() -> None:
     g = graph.init_with_source_and_target(source="apa", target="bepa apa")
-    expected_source = token.identify(token.tokenize("apa"), "s")
-    expected_target = token.identify(token.tokenize("bepa apa"), "t")
+    expected_source = text_token.identify(text_token.tokenize("apa"), "s")
+    expected_target = text_token.identify(text_token.tokenize("bepa apa"), "t")
     g_expected = graph.Graph(
         source=expected_source,
         target=expected_target,
@@ -95,19 +95,19 @@ def test_unaligned_set_side() -> None:
     print("<<< test_unaligned_set_side")
 
     expected_source = [
-        token.Token(id="s0", text="a "),
-        token.Token(id="s1", text="bc "),
-        token.Token(id="s2", text="d "),
+        text_token.Token(id="s0", text="a "),
+        text_token.Token(id="s1", text="bc "),
+        text_token.Token(id="s2", text="d "),
     ]
     expected_g0_target = [
-        token.Token(id="t0", text="a "),
-        token.Token(id="t1", text="bc "),
-        token.Token(id="t2", text="d "),
+        text_token.Token(id="t0", text="a "),
+        text_token.Token(id="t1", text="bc "),
+        text_token.Token(id="t2", text="d "),
     ]
     expected_g_target = [
-        token.Token(id="t3", text="ab "),
-        token.Token(id="t4", text="c "),
-        token.Token(id="t5", text="d "),
+        text_token.Token(id="t3", text="ab "),
+        text_token.Token(id="t4", text="c "),
+        text_token.Token(id="t5", text="d "),
     ]
     expected_g_edges = {
         "e-s0-s1-s2-t3-t4-t5": graph.Edge(
@@ -131,19 +131,19 @@ def test_graph_align() -> None:
     g = graph.unaligned_set_side(g0, Side.target, "ab c d")
 
     expected_source = [
-        token.Token(id="s0", text="a "),
-        token.Token(id="s1", text="bc "),
-        token.Token(id="s2", text="d "),
+        text_token.Token(id="s0", text="a "),
+        text_token.Token(id="s1", text="bc "),
+        text_token.Token(id="s2", text="d "),
     ]
     expected_g0_target = [
-        token.Token(id="t0", text="a "),
-        token.Token(id="t1", text="bc "),
-        token.Token(id="t2", text="d "),
+        text_token.Token(id="t0", text="a "),
+        text_token.Token(id="t1", text="bc "),
+        text_token.Token(id="t2", text="d "),
    ]
     expected_g_target = [
-        token.Token(id="t3", text="ab "),
-        token.Token(id="t4", text="c "),
-        token.Token(id="t5", text="d "),
+        text_token.Token(id="t3", text="ab "),
+        text_token.Token(id="t4", text="c "),
+        text_token.Token(id="t5", text="d "),
     ]
     expected_g_edges = {
         "e-s0-s1-s2-t3-t4-t5": graph.Edge(
diff --git a/tests/test_shared/test_lists.py b/tests/test_shared/test_lists.py
index 797c2f4..1720aa6 100644
--- a/tests/test_shared/test_lists.py
+++ b/tests/test_shared/test_lists.py
@@ -2,14 +2,15 @@
 
 
 def test_splice_1() -> None:
-    (*s_chars,) = "abcdef"
+    s_chars = ["a", "b", "c", "d", "e", "f"]
     ex, rm = lists.splice(s_chars, 3, 1, " ", "_")
     assert "".join(ex) == "abc _ef"
     assert "".join(rm) == "d"
 
 
 def test_splice_2() -> None:
-    (*s_chars,) = "abcdef"
+    s_chars = ["a", "b", "c", "d", "e", "f"]
+
     (ex, rm) = lists.splice(s_chars, 3, 2, " ", "_")
     assert "".join(ex) == "abc _f"
     assert "".join(rm) == "de"
diff --git a/tests/test_token.py b/tests/test_token.py
index 1d9dd72..2479ffa 100644
--- a/tests/test_token.py
+++ b/tests/test_token.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from parallel_corpus.token import Token, identify, tokenize
+from parallel_corpus.text_token import Token, identify, tokenize
 
 
 def test_can_create_token() -> None: