diff --git a/mypy.ini b/mypy.ini
index 18a6155..ea4d3cc 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -4,5 +4,5 @@ namespace_packages = True
 explicit_package_bases = True
 show_error_codes = True
 ignore_missing_imports = True
-python_version = 3.8
+python_version = 3.9
 ; plugins = adt.mypy_plugin
diff --git a/ruff.toml b/ruff.toml
index 4221071..b55b803 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,5 +1,7 @@
 line-length = 97

+target-version = "py39"
+
 [lint]
 select = [
     "A", # flake8-builtins
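The two config bumps above are what license the mechanical typing cleanup in the rest of this diff: from Python 3.9 on (PEP 585), the builtin containers accept subscripts at runtime, so the typing.List/Dict/Tuple aliases become redundant. A minimal sketch of the style being migrated to (hypothetical function, not from this repo):

    # Python 3.9+ only: on 3.8, evaluating `list[str]` in an annotation raises
    # TypeError unless `from __future__ import annotations` defers evaluation.
    def count_tokens(tokens: list[str]) -> dict[str, int]:
        counts: dict[str, int] = {}
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1
        return counts

Keeping mypy's python_version and ruff's target-version in sync matters: target-version tells ruff which syntax it may assume when linting and, where pyupgrade-style rules are enabled, which rewrites are safe to suggest.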
diff --git a/src/parallel_corpus/graph.py b/src/parallel_corpus/graph.py
index 8b80a5b..081b50f 100644
--- a/src/parallel_corpus/graph.py
+++ b/src/parallel_corpus/graph.py
@@ -4,8 +4,9 @@
 import itertools
 import logging
 import re
+from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, TypedDict, TypeVar
+from typing import Optional, TypedDict, TypeVar

 import parallel_corpus.shared.ranges
 import parallel_corpus.shared.str_map
@@ -33,24 +34,24 @@ class Edge:
     # a copy of the identifier used in the edges object of the graph
     id: str
     # these are ids to source and target tokens
-    ids: List[str]
+    ids: list[str]
     # labels on this edge
-    labels: List[str]
+    labels: list[str]
     # is this manually or automatically aligned
     manual: bool
     comment: Optional[str] = None


-Edges = Dict[str, Edge]
+Edges = dict[str, Edge]


 @dataclass
-class Graph(SourceTarget[List[Token]]):  # noqa: D101
+class Graph(SourceTarget[list[Token]]):  # noqa: D101
     edges: Edges
     comment: Optional[str] = None

     def copy_with_updated_side_and_edges(  # noqa: D102
-        self, side: Side, new_tokens: List[Token], edges: Edges
+        self, side: Side, new_tokens: list[Token], edges: Edges
     ) -> "Graph":
         source = self.source if side == Side.target else new_tokens
         target = new_tokens if side == Side.target else self.target
@@ -65,8 +66,8 @@ def next_id(g: Graph) -> int:


 def edge(
-    ids: List[str],
-    labels: List[str],
+    ids: list[str],
+    labels: list[str],
     *,
     comment: Optional[str] = None,
     manual: bool = False,
@@ -82,7 +83,7 @@ def edge(
     )


-def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]:
+def edge_record(es: Iterable[Edge]) -> dict[str, Edge]:
     return {e.id: e for e in es}
@@ -96,7 +97,7 @@ def init_with_source_and_target(source: str, target: str, *, manual: bool = Fals
     )


-def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
+def init_from(tokens: list[str], *, manual: bool = False) -> Graph:
     return align(
         Graph(
             source=text_token.identify(tokens, "s"),
@@ -109,7 +110,7 @@


 def init_from_source_and_target(
-    source: List[str], target: List[str], *, manual: bool = False
+    source: list[str], target: list[str], *, manual: bool = False
 ) -> Graph:
     source_tokens = text_token.identify(source, "s")
     target_tokens = text_token.identify(target, "t")
@@ -129,12 +130,12 @@
 class TextLabels(TypedDict):
     text: str
-    labels: List[str]
+    labels: list[str]


-def from_unaligned(st: SourceTarget[List[TextLabels]]) -> Graph:
+def from_unaligned(st: SourceTarget[list[TextLabels]]) -> Graph:
     """Initialize a graph from unaligned tokens."""
-    edges: Dict[str, Edge] = {}
+    edges: dict[str, Edge] = {}

     def proto_token_to_token(tok: TextLabels, i: int, prefix: str) -> Token:
         id_ = f"{prefix}{i}"
@@ -142,7 +143,7 @@ def proto_token_to_token(tok: TextLabels, i: int, prefix: str) -> Token:
         edges[id_] = e
         return Token(tok["text"], id_)

-    def proto_tokens_to_tokens(toks: List[TextLabels], side: Side) -> List[Token]:
+    def proto_tokens_to_tokens(toks: list[TextLabels], side: Side) -> list[Token]:
         return [
             proto_token_to_token(tok, i, "s" if side == Side.source else "t")
             for i, tok in enumerate(toks)
@@ -207,7 +208,7 @@ def align(g: Graph) -> Graph:
             c.a is not None and c.b is not None and c.a.id is not None and c.b.id is not None
         ):
             uf.union(c.a.id, c.b.id)
-    proto_edges: Dict[str, Edge] = {k: e for k, e in g.edges.items() if e.manual}
+    proto_edges: dict[str, Edge] = {k: e for k, e in g.edges.items() if e.manual}
     first: UniqueCheck[str] = UniqueCheck()

     def update_edges(tokens, _side) -> None:  # noqa: ANN001
@@ -232,7 +233,7 @@ def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
     return align(unaligned_rearrange(g, begin, end, dest))


-def target_text(g: SourceTarget[List[text_token.Text]]) -> str:
+def target_text(g: SourceTarget[list[text_token.Text]]) -> str:
     return text_token.text(g.target)
@@ -263,21 +264,21 @@ class CharIdPair:
     id: Optional[str] = None


-def to_char_ids(token: Token) -> List[CharIdPair]:
+def to_char_ids(token: Token) -> list[CharIdPair]:
     return parallel_corpus.shared.str_map.str_map(
         token.text,
         lambda char, _i: CharIdPair(char=char, id=None if char == " " else token.id),
     )


-def edge_map(g: Graph) -> Dict[str, Edge]:
+def edge_map(g: Graph) -> dict[str, Edge]:
     """Map from token ids to edges.

     Args:
         g (Graph): the Graph to build the edge map from.

     Returns:
-        Dict[str, Edge]: a map from token ids to edges
+        dict[str, Edge]: a map from token ids to edges
     """
     edges = {}
     for e in g.edges.values():
@@ -353,7 +354,7 @@ def get_side_text(g: Graph, side: Side) -> str:
     return text_token.text(g.get_side(side))


-def get_side_texts(g: Graph, side: Side) -> List[str]:
+def get_side_texts(g: Graph, side: Side) -> list[str]:
     return text_token.texts(g.get_side(side))
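Everything in graph.py is a 1:1 List→list / Dict→dict swap with no behavior change. For orientation, a hypothetical usage sketch based only on the signatures visible above (the resulting token texts and edge counts depend on align(), so the printed values are illustrative, not asserted by this diff):

    from parallel_corpus import graph

    g = graph.init_with_source_and_target("en liten katt", "en liten hund")
    # g.edges is the `Edges = dict[str, Edge]` alias introduced above;
    # each Edge.ids lists the source/target token ids it connects.
    print([t.text for t in g.target])
    print(len(g.edges))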
diff --git a/src/parallel_corpus/shared/__init__.py b/src/parallel_corpus/shared/__init__.py
index 196f7e7..71d74bd 100644
--- a/src/parallel_corpus/shared/__init__.py
+++ b/src/parallel_corpus/shared/__init__.py
@@ -1,7 +1,7 @@
 """Utilities."""

 import re
-from typing import List, TypeVar
+from typing import TypeVar

 from . import diffs
@@ -17,8 +17,8 @@ def end_with_space(s: str) -> str:
     return f"{s} " if (ENDING_WHITESPACE.fullmatch(s[-1]) is None) else s


-def uniq(xs: List[str]) -> List[str]:
-    used = set()
+def uniq(xs: list[str]) -> list[str]:
+    used: set[str] = set()
     return [x for x in xs if x not in used and (used.add(x) or True)]  # type: ignore [func-returns-value]
diff --git a/src/parallel_corpus/shared/dicts.py b/src/parallel_corpus/shared/dicts.py
index 7041264..048eaa6 100644
--- a/src/parallel_corpus/shared/dicts.py
+++ b/src/parallel_corpus/shared/dicts.py
@@ -1,6 +1,6 @@
-"""Dicts."""
+"""dicts."""

-from typing import TYPE_CHECKING, Callable, Dict, List, TypeVar
+from typing import TYPE_CHECKING, Callable, TypeVar

 if TYPE_CHECKING:
     from _typeshed import SupportsRichComparison
@@ -14,15 +14,15 @@ V = TypeVar("V")


-def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V:  # noqa: D103
+def modify(x: dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V:  # noqa: D103
     x[k] = f(x.get(k) or default)
     return x[k]


-def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]:  # noqa: D103
+def traverse(x: dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> list[B]:  # noqa: D103
     ks = sorted(x.keys()) if sort_keys else x.keys()
     return [k(x[i], i) for i in ks]


-def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]:  # noqa: D103
+def filter_dict(x: dict[K, A], k: Callable[[A, K], bool]) -> dict[K, A]:  # noqa: D103
     return {id_: a for id_, a in x.items() if k(a, id_)}
diff --git a/src/parallel_corpus/shared/diffs.py b/src/parallel_corpus/shared/diffs.py
index 233e29c..8af4b76 100644
--- a/src/parallel_corpus/shared/diffs.py
+++ b/src/parallel_corpus/shared/diffs.py
@@ -1,8 +1,9 @@
 """Diffs."""

 import enum
+from collections.abc import Generator
 from itertools import starmap
-from typing import Any, Callable, Dict, Generator, Generic, List, Optional, Tuple, TypeVar, Union
+from typing import Any, Callable, Generic, Optional, TypeVar, Union

 import diff_match_patch as dmp_module
 from typing_extensions import Self
@@ -46,8 +47,8 @@ def deleted(cls, a: A) -> Self:  # noqa: D102
     def inserted(cls, b: B) -> Self:  # noqa: D102
         return cls(ChangeType.INSERTED, b=b)

-    def model_dump(self) -> Dict[str, Union[int, A, B]]:  # noqa: D102
-        out: Dict[str, Union[int, A, B]] = {
+    def model_dump(self) -> dict[str, Union[int, A, B]]:  # noqa: D102
+        out: dict[str, Union[int, A, B]] = {
             "change": int(self.change),
         }
         if self.a is not None:
@@ -94,17 +95,17 @@ def char_stream() -> Generator[str, None, None]:


 def hdiff(  # noqa: D103
-    xs: List[A],
-    ys: List[B],
+    xs: list[A],
+    ys: list[B],
     a_cmp: Callable[[A], str] = str,
     b_cmp: Callable[[B], str] = str,
-) -> List[Change[A, B]]:
-    to: Dict[str, str] = {}
-    a_from: Dict[str, List[A]] = {}
-    b_from: Dict[str, List[B]] = {}
+) -> list[Change[A, B]]:
+    to: dict[str, str] = {}
+    a_from: dict[str, list[A]] = {}
+    b_from: dict[str, list[B]] = {}
     chars = char_stream()

-    def assign(c: C, c_cmp: Callable[[C], str], c_from: Dict[str, List[C]]) -> str:
+    def assign(c: C, c_cmp: Callable[[C], str], c_from: dict[str, list[C]]) -> str:
         s = c_cmp(c)
         u = to.get(s)
         if u is None:
@@ -147,7 +148,7 @@ def map_change(change: int, cs):  # noqa: ANN001, ANN202
     return out


-def token_diff(s1: str, s2: str) -> List[Tuple[int, str]]:  # noqa: D103
+def token_diff(s1: str, s2: str) -> list[tuple[int, str]]:  # noqa: D103
     d = dmp.diff_main(s1, s2)
     dmp.diff_cleanupSemantic(d)
     return d
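The new list[tuple[int, str]] return annotation on token_diff matches what diff-match-patch actually produces: diff_main returns (op, text) pairs with op in {-1 deleted, 0 equal, 1 inserted}, and diff_cleanupSemantic only rewrites that list in place. A quick sketch of the shape (the example strings are made up):

    from parallel_corpus.shared.diffs import token_diff

    changes = token_diff("en katt", "en hund")
    # e.g. [(0, 'en '), (-1, 'katt'), (1, 'hund')]; the exact segmentation
    # depends on diff_cleanupSemantic, but every element is an (int, str) pair.
    assert all(isinstance(op, int) and isinstance(s, str) for op, s in changes)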
diff --git a/src/parallel_corpus/shared/functional.py b/src/parallel_corpus/shared/functional.py
index 062bcbd..13867a7 100644
--- a/src/parallel_corpus/shared/functional.py
+++ b/src/parallel_corpus/shared/functional.py
@@ -1,6 +1,7 @@
 """Functional utilities."""

-from typing import Callable, Sequence, TypeVar
+from collections.abc import Sequence
+from typing import Callable, TypeVar

 A = TypeVar("A")
diff --git a/src/parallel_corpus/shared/ids.py b/src/parallel_corpus/shared/ids.py
index f1b2510..bcd5701 100644
--- a/src/parallel_corpus/shared/ids.py
+++ b/src/parallel_corpus/shared/ids.py
@@ -1,7 +1,7 @@
 """Ids."""

 import re
-from typing import Iterable
+from collections.abc import Iterable

 DIGITS = re.compile(r"\d+")
diff --git a/src/parallel_corpus/shared/lists.py b/src/parallel_corpus/shared/lists.py
index a0e25e9..85fa629 100644
--- a/src/parallel_corpus/shared/lists.py
+++ b/src/parallel_corpus/shared/lists.py
@@ -1,12 +1,12 @@
-"""List."""
+"""list."""

 import copy
-from typing import List, Tuple, TypeVar
+from typing import TypeVar

 A = TypeVar("A")


-def rearrange(xs: List[A], begin: int, end: int, dest: int) -> List[A]:
+def rearrange(xs: list[A], begin: int, end: int, dest: int) -> list[A]:
     """Move a slice of the items and puts back them at some destination.

     rearrange([0, 1, 2, 3], 1, 2, 0) // => [1, 2, 0, 3]
@@ -23,14 +23,14 @@
     return pre + mid + post


-def splice(xs: List[A], start: int, count: int, *insert) -> Tuple[List[A], List[A]]:  # noqa: ANN002, D103
+def splice(xs: list[A], start: int, count: int, *insert) -> tuple[list[A], list[A]]:  # noqa: ANN002, D103
     ys = copy.deepcopy(xs)
     zs = ys[start : (start + count)]
     ys[start : (start + count)] = insert
     return ys, zs


-def split_at_3(xs: List[A], start: int, end: int) -> Tuple[List[A], List[A], List[A]]:
+def split_at_3(xs: list[A], start: int, end: int) -> tuple[list[A], list[A], list[A]]:
     """Split an array into three pieces.

     splitAt3('0123456'.split(''), 2, 4).map(xs => xs.join('')) // => ['01', '23', '456']
@@ -43,5 +43,5 @@ def split_at_3(xs: List[A], start: int, end: int) -> Tuple[List[A], List[A], Lis
     return a, b, c


-def split_at(xs: List[A], index: int) -> Tuple[List[A], List[A]]:  # noqa: D103
+def split_at(xs: list[A], index: int) -> tuple[list[A], list[A]]:  # noqa: D103
     return xs[:index], xs[index:]
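lists.py gets the same mechanical swap; the semantics, documented in the JS-flavored docstrings above, mirror JavaScript's Array.prototype.splice. Worked examples, derived by reading the function bodies above:

    from parallel_corpus.shared.lists import rearrange, splice

    rearrange([0, 1, 2, 3], 1, 2, 0)   # [1, 2, 0, 3], per the docstring
    ys, zs = splice([0, 1, 2, 3, 4], 1, 2, 9)
    # ys == [0, 9, 3, 4]: the slice [1, 2] was replaced by the inserted 9
    # zs == [1, 2]: the removed slice, returned separately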
diff --git a/src/parallel_corpus/shared/str_map.py b/src/parallel_corpus/shared/str_map.py
index 784fabd..fe469fa 100644
--- a/src/parallel_corpus/shared/str_map.py
+++ b/src/parallel_corpus/shared/str_map.py
@@ -1,9 +1,9 @@
 """str_map."""

-from typing import Callable, List, TypeVar
+from typing import Callable, TypeVar

 A = TypeVar("A")


-def str_map(s: str, f: Callable[[str, int], A]) -> List[A]:  # noqa: D103
+def str_map(s: str, f: Callable[[str, int], A]) -> list[A]:  # noqa: D103
     return [f(s[i], i) for i in range(len(s))]
diff --git a/src/parallel_corpus/shared/union_find.py b/src/parallel_corpus/shared/union_find.py
index 6cf01e1..7a32a87 100644
--- a/src/parallel_corpus/shared/union_find.py
+++ b/src/parallel_corpus/shared/union_find.py
@@ -4,7 +4,7 @@
 import functools
 import json
 from dataclasses import dataclass
-from typing import Callable, Dict, Generic, List, Optional, Tuple, TypeVar
+from typing import Callable, Generic, Optional, TypeVar

 from typing_extensions import Self
@@ -23,13 +23,13 @@ def union(self, x: A, y: A) -> A:
         """Make these belong to the same group."""

     @abc.abstractmethod
-    def unions(self, xs: List[A]) -> None:
+    def unions(self, xs: list[A]) -> None:
         """Make these belong to the same group."""


 class UnionFind(UnionFindOperations[int]):  # noqa: D101
-    def __init__(self, *, rev: Optional[List[int]] = None) -> None:  # noqa: D107
-        self._rev: List[int] = rev or []
+    def __init__(self, *, rev: Optional[list[Optional[int]]] = None) -> None:  # noqa: D107
+        self._rev: list[Optional[int]] = rev or []

     def find(self, x: int) -> int:  # noqa: D102
         while x >= len(self._rev):
@@ -47,14 +47,14 @@ def union(self, x: int, y: int) -> int:  # noqa: D102
         self._rev[find_y] = find_x
         return find_x

-    def unions(self, xs: List[int]) -> None:  # noqa: D102
+    def unions(self, xs: list[int]) -> None:  # noqa: D102
         functools.reduce(self.union, xs, xs[0])


 @dataclass
 class Renumber(Generic[A]):  # noqa: D101
-    bw: Dict[str, int]
-    fw: Dict[int, A]
+    bw: dict[str, int]
+    fw: dict[int, A]
     i = 0
     serialize: Callable[[A], str]
@@ -76,7 +76,7 @@ def init(cls, serialize: Callable[[A], str] = json.dumps) -> Self:  # noqa: D102

 def renumber(
     serialize: Callable[[A], str] = json.dumps,
-) -> Tuple[Callable[[int], Optional[A]], Callable[[A], int]]:
+) -> tuple[Callable[[int], Optional[A]], Callable[[A], int]]:
     """Assign unique numbers to each distinct element.

     const {un, num} = Renumber()
@@ -111,7 +111,7 @@ def find(self, x: A) -> Optional[A]:  # noqa: D102
     def union(self, x: A, y: A) -> Optional[A]:  # noqa: D102
         return self._renum.un(self._uf.union(self._renum.num(x), self._renum.num(y)))

-    def unions(self, xs: List[A]) -> None:  # noqa: D102
+    def unions(self, xs: list[A]) -> None:  # noqa: D102
         num_xs_0 = self._renum.num(xs[0])
         for x in xs[1:]:
             self._uf.union(num_xs_0, self._renum.num(x))
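The one non-mechanical change in union_find.py is widening rev to Optional[list[Optional[int]]]: find() grows the backing list on demand (presumably padding with None, which is what the widened element type admits), so the old List[int] annotation was too narrow. A sketch of the intended semantics, assuming the usual union-find contract since the body of find() is truncated in this hunk:

    from parallel_corpus.shared.union_find import UnionFind

    uf = UnionFind()
    uf.union(1, 2)
    uf.union(2, 3)
    assert uf.find(1) == uf.find(3)  # 1, 2 and 3 now share one representative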
diff --git a/src/parallel_corpus/shared/unique_check.py b/src/parallel_corpus/shared/unique_check.py
index 71e81e4..47d8ccc 100644
--- a/src/parallel_corpus/shared/unique_check.py
+++ b/src/parallel_corpus/shared/unique_check.py
@@ -1,6 +1,6 @@
 """UniqueCheck."""

-from typing import Dict, Generic, TypeVar
+from typing import Generic, TypeVar

 S = TypeVar("S")
@@ -55,7 +55,7 @@ class Count(Generic[S]):
     """

     def __init__(self) -> None:  # noqa: D107
-        self.m: Dict[S, int] = {}
+        self.m: dict[S, int] = {}

     def get(self, s: S) -> int:  # noqa: D102
         return self.m.get(s) or 0
diff --git a/src/parallel_corpus/text_token.py b/src/parallel_corpus/text_token.py
index 9e10cb5..bc6bc6e 100644
--- a/src/parallel_corpus/text_token.py
+++ b/src/parallel_corpus/text_token.py
@@ -1,8 +1,9 @@
 """Token."""

 import re
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import List, Sequence, TypedDict
+from typing import TypedDict

 from parallel_corpus import shared
@@ -33,7 +34,7 @@ def text(ts: Sequence[Text]) -> str:
     return "".join(texts(ts))


-def texts(ts: Sequence[Text]) -> List[str]:
+def texts(ts: Sequence[Text]) -> list[str]:
     """Return text from the given tokens as list.

     >>> texts(identify(tokenize('apa bepa cepa '), '#'))
@@ -42,7 +43,7 @@
     return [t.text for t in ts]


-def tokenize(s: str) -> List[str]:
+def tokenize(s: str) -> list[str]:
     """Tokenizes text on whitespace, prefers to have trailing whitespace."""
     return list(
         map(
@@ -52,7 +53,7 @@
     )


-def identify(toks: List[str], prefix: str) -> List[Token]:  # noqa: D103
+def identify(toks: list[str], prefix: str) -> list[Token]:  # noqa: D103
     return [Token(text=text, id=f"{prefix}{i}") for i, text in enumerate(toks)]


@@ -61,7 +62,7 @@ class TokenAt(TypedDict):  # noqa: D101
     offset: int


-def token_at(tokens: List[str], character_offset: int) -> TokenAt:
+def token_at(tokens: list[str], character_offset: int) -> TokenAt:
     """Return token at the given offset.

     >>> abc = ['012', '3456', '789']
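text_token.py keeps its doctests, and the parametrized case in the test diff below pins the same behavior, so a quick interactive check of the migrated signatures looks like:

    from parallel_corpus.text_token import identify, tokenize

    tokenize(" apa bepa cepa ")
    # [' apa ', 'bepa ', 'cepa ']: whitespace attaches to the token,
    # trailing rather than leading where possible (see the test table below)
    identify(tokenize("apa bepa "), "s")
    # [Token(text='apa ', id='s0'), Token(text='bepa ', id='s1')]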
diff --git a/tests/test_graph.py b/tests/test_graph.py
index a514034..5ebf4c8 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import pytest

 from parallel_corpus import graph, text_token
@@ -172,11 +170,11 @@ def test_graph_align() -> None:
     assert len(g_aligned.edges) == 2


-def show(g: graph.Graph) -> List[str]:
+def show(g: graph.Graph) -> list[str]:
     return [t.text for t in g.target]


-def show_source(g: graph.Graph) -> List[str]:
+def show_source(g: graph.Graph) -> list[str]:
     return [s.text for s in g.source]
diff --git a/tests/test_token.py b/tests/test_token.py
index 2479ffa..6e46bd2 100644
--- a/tests/test_token.py
+++ b/tests/test_token.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import pytest

 from parallel_corpus.text_token import Token, identify, tokenize
@@ -23,7 +21,7 @@ def test_can_create_token() -> None:
         (" apa bepa cepa ", [" apa ", "bepa ", "cepa "]),
     ],
 )
-def test_tokenize(text: str, expected: List[str], snapshot) -> None:
+def test_tokenize(text: str, expected: list[str], snapshot) -> None:
     actual = tokenize(text)
     assert actual == expected