diff --git a/ruff.toml b/ruff.toml
index 66c360d..4221071 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,36 +1,74 @@
 line-length = 97
-target-version = "py38"
-
 [lint]
-# Enable flake8-bugbear (`B`) rules.
 select = [
-    "A",
-    # "ANN",
-    "B",
-    "BLE",
-    "C4",
-    "C90",
-    # "D",
-    "E",
-    "F",
-    "FBT",
-    "I",
-    "RUF",
-    "S",
-    "YTT",
+    "A",  # flake8-builtins
+    "ANN",  # flake8-annotations
+    "ARG",  # flake8-unused-arguments
+    "B",  # flake8-bugbear
+    "C4",  # flake8-comprehensions
+    "COM",  # flake8-commas
+    "D",  # pydocstyle
+    "D400",  # pydocstyle: ends-in-period
+    "D401",  # pydocstyle: non-imperative-mood
+    "E",  # pycodestyle: errors
+    "F",  # Pyflakes
+    "FLY",  # flynt
+    "FURB",  # refurb
+    "G",  # flake8-logging-format
+    "I",  # isort
+    "ISC",  # flake8-implicit-str-concat
+    "N",  # pep8-naming
+    "PERF",  # Perflint
+    "PIE",  # flake8-pie
+    "PL",  # Pylint
+    "PT",  # flake8-pytest-style
+    "PTH",  # flake8-use-pathlib
+    "Q",  # flake8-quotes
+    "RET",  # flake8-return
+    "RSE",  # flake8-raise
+    "RUF",  # Ruff-specific rules
+    "SIM",  # flake8-simplify
+    "T20",  # flake8-print
+    "TID",  # flake8-tidy-imports
+    "UP",  # pyupgrade
+    "W",  # pycodestyle: warnings
+]
+ignore = [
+    "ANN101",  # flake8-annotations: missing-type-self (deprecated)
+    "ANN102",  # flake8-annotations: missing-type-cls (deprecated)
+    "ANN401",  # flake8-annotations: any-type
+    "B008",  # flake8-bugbear: function-call-in-default-argument
+    "COM812",  # flake8-commas: missing-trailing-comma
+    "E741",  # pycodestyle: ambiguous-variable-name
+    "PLR09",  # Pylint: too-many-*
+    "PLR1702",  # Pylint: too-many-nested-blocks
+    "SIM105",  # flake8-simplify: suppressible-exception
 ]
+preview = true
 
-# Never enforce `E501` (line length violations).
-# ignore = ["E501"]
-ignore = ["ANN101", "ANN102", "D203", "D213"]
+[lint.per-file-ignores]
+"__init__.py" = ["F401"]
+"tests/*.py" = [
+    "D100",
+    "D101",
+    "D102",
+    "D103",
+    "D104",
+    "S101",
+    "ANN001",
+    "PLR2004",
+    "N806",
+    "T201",
+]
 
+[lint.pydocstyle]
+convention = "google"
 
-# Avoid trying to fix flake8-bugbear (`B`) violations.
-unfixable = ["B"]
+# # Never enforce `E501` (line length violations).
+# # ignore = ["E501"]
+# ignore = ["ANN101", "ANN102", "D203", "D213"]
 
-# Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`.
-[lint.per-file-ignores]
-"tests/*.py" = ["D100", "D101", "D102", "D103", "D104", "S101"]
-# "__init__.py" = ["E402"]
+# # Avoid trying to fix flake8-bugbear (`B`) violations.
+# unfixable = ["B"]
diff --git a/src/parallel_corpus/__init__.py b/src/parallel_corpus/__init__.py
index e69de29..ce72650 100644
--- a/src/parallel_corpus/__init__.py
+++ b/src/parallel_corpus/__init__.py
@@ -0,0 +1 @@
+"""Parallel corpus as a graph."""
diff --git a/src/parallel_corpus/graph.py b/src/parallel_corpus/graph.py
index b29ebeb..f083907 100644
--- a/src/parallel_corpus/graph.py
+++ b/src/parallel_corpus/graph.py
@@ -26,7 +26,7 @@
 
 
 @dataclass
-class Edge:
+class Edge:  # noqa: D101
     # a copy of the identifier used in the edges object of the graph
     id: str
     # these are ids to source and target tokens
@@ -42,26 +42,26 @@ class Edge:
 
 
 @dataclass
-class Graph(SourceTarget[List[Token]]):
+class Graph(SourceTarget[List[Token]]):  # noqa: D101
     edges: Edges
     comment: Optional[str] = None
 
-    def copy_with_updated_side_and_edges(
+    def copy_with_updated_side_and_edges(  # noqa: D102
         self, side: Side, new_tokens: List[Token], edges: Edges
     ) -> "Graph":
         source = self.source if side == Side.target else new_tokens
         target = new_tokens if side == Side.target else self.target
         return Graph(source=source, target=target, edges=edges, comment=self.comment)
 
-    def copy_with_edges(self, edges: Edges) -> "Graph":
+    def copy_with_edges(self, edges: Edges) -> "Graph":  # noqa: D102
         return Graph(source=self.source, target=self.target, edges=edges, comment=self.comment)
 
 
-def next_id(g: Graph) -> int:
+def next_id(g: Graph) -> int:  # noqa: D103
     return ids.next_id(itertools.chain((t.id for t in g.target), (s.id for s in g.source)))
 
 
-def edge(
+def edge(  # noqa: D103
     ids: List[str],
     labels: List[str],
     *,
@@ -79,21 +79,21 @@ def edge(
     )
 
 
-def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]:
+def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]:  # noqa: D103
     return {e.id: e for e in es}
 
 
-def init(s: str, *, manual: bool = False) -> Graph:
+def init(s: str, *, manual: bool = False) -> Graph:  # noqa: D103
     return init_from(token.tokenize(s), manual=manual)
 
 
-def init_with_source_and_target(source: str, target: str, *, manual: bool = False) -> Graph:
+def init_with_source_and_target(source: str, target: str, *, manual: bool = False) -> Graph:  # noqa: D103
     return init_from_source_and_target(
         source=token.tokenize(source), target=token.tokenize(target), manual=manual
     )
 
 
-def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
+def init_from(tokens: List[str], *, manual: bool = False) -> Graph:  # noqa: D103
     return align(
         Graph(
             source=token.identify(tokens, "s"),
@@ -105,7 +105,7 @@ def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
     )
 
 
-def init_from_source_and_target(
+def init_from_source_and_target(  # noqa: D103
     source: List[str], target: List[str], *, manual: bool = False
 ) -> Graph:
     source_tokens = token.identify(source, "s")
@@ -124,13 +124,13 @@ def init_from_source_and_target(
     )
 
 
-class TextLabels(TypedDict):
+class TextLabels(TypedDict):  # noqa: D101
     text: str
     labels: List[str]
 
 
 def from_unaligned(st: SourceTarget[List[TextLabels]]) -> Graph:
-    """Initialize a graph from unaligned tokens"""
+    """Initialize a graph from unaligned tokens."""
     edges: Dict[str, Edge] = {}
 
     def proto_token_to_token(tok: TextLabels, i: int, prefix: str) -> Token:
@@ -150,19 +150,19 @@ def proto_tokens_to_tokens(toks: List[TextLabels], side: Side) -> List[Token]:
     return align(Graph(source=g.source, target=g.target, edges=edges))
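Review note (not part of the patch): a minimal sketch of the constructors above. The trailing-space tokenization and the `s`/`t` token-id prefixes are taken from the doctests and tests later in this diff.

```python
# Illustrative only: build a graph from one string; source and target start out identical.
from parallel_corpus import graph

g = graph.init("apa bepa cepa ")
print(graph.target_text(g))              # 'apa bepa cepa ' per the token doctests below
print(" ".join(t.id for t in g.target))  # 't0 t1 t2' per tests/test_graph.py
```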
 
 
-def modify(g: Graph, from_: int, to: int, text: str, side: Side = Side.target) -> Graph:
+def modify(g: Graph, from_: int, to: int, text: str, side: Side = Side.target) -> Graph:  # noqa: D103
     return align(unaligned_modify(g, from_, to, text, side))
 
 
-def set_source(g: Graph, text: str) -> Graph:
+def set_source(g: Graph, text: str) -> Graph:  # noqa: D103
     return align(unaligned_set_side(g, Side.source, text))
 
 
-def set_target(g: Graph, text: str) -> Graph:
+def set_target(g: Graph, text: str) -> Graph:  # noqa: D103
     return align(unaligned_set_side(g, Side.target, text))
 
 
-def merge_edges(*es) -> Edge:
+def merge_edges(*es) -> Edge:  # noqa: ANN002, D103
     ids = []
     labels = []
     manual = False
@@ -184,7 +184,7 @@ def merge_edges(*es) -> Edge:
 zero_edge = merge_edges()
 
 
-def align(g: Graph) -> Graph:
+def align(g: Graph) -> Graph:  # noqa: D103
     # Use a union-find to group characters into edges.
     uf = parallel_corpus.shared.union_find.poly_union_find(lambda u: u)
     em = edge_map(g)
@@ -207,7 +207,7 @@
     proto_edges = {k: e for k, e in g.edges.items() if e.manual}
     first: UniqueCheck[str] = UniqueCheck()
 
-    def update_edges(tokens, _side):
+    def update_edges(tokens, _side) -> None:  # noqa: ANN001
         for tok in tokens:
             e_repr = em[tok.id]
             if not e_repr.manual:
@@ -225,21 +225,21 @@ def update_edges(tokens, _side):
     return g.copy_with_edges(edges)
 
 
-def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
+def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:  # noqa: D103
     return align(unaligned_rearrange(g, begin, end, dest))
 
 
-def target_text(g: SourceTarget[List[token.Text]]) -> str:
+def target_text(g: SourceTarget[List[token.Text]]) -> str:  # noqa: D103
     return token.text(g.target)
 
 
 @dataclass
-class CharIdPair:
+class CharIdPair:  # noqa: D101
     char: str
     id: Optional[str] = None
 
 
-def to_char_ids(token: Token) -> List[CharIdPair]:
+def to_char_ids(token: Token) -> List[CharIdPair]:  # noqa: D103
     return parallel_corpus.shared.str_map.str_map(
         token.text,
         lambda char, _i: CharIdPair(char=char, id=None if char == " " else token.id),
     )
 
 
 def edge_map(g: Graph) -> Dict[str, Edge]:
-    """Map from token ids to edges
+    """Map from token ids to edges.
 
     Args:
         g (Graph): the Graph to build the edge map from.
@@ -262,7 +262,7 @@
     return edges
 
 
-def unaligned_set_side(g: Graph, side: Side, text: str) -> Graph:
+def unaligned_set_side(g: Graph, side: Side, text: str) -> Graph:  # noqa: D103
     text0 = get_side_text(g, side)
     edits = parallel_corpus.shared.ranges.edit_range(text0, text)
 
@@ -313,7 +313,6 @@ def unaligned_modify(
     Indexes are character offsets (use CodeMirror's doc.posFromIndex and doc.indexFromPos to
     convert)
     """  # noqa: E501
-
     tokens = get_side_texts(g, side)
     token_at = token.token_at(tokens, from_)
     from_token, from_ix = token_at["token"], token_at["offset"]
@@ -326,15 +325,15 @@ def unaligned_modify(
     return unaligned_modify_tokens(g, from_token, to_token + 1, pre + text + post, side)
 
 
-def get_side_text(g: Graph, side: Side) -> str:
+def get_side_text(g: Graph, side: Side) -> str:  # noqa: D103
     return token.text(g.get_side(side))
 
 
-def get_side_texts(g: Graph, side: Side) -> List[str]:
+def get_side_texts(g: Graph, side: Side) -> List[str]:  # noqa: D103
     return token.texts(g.get_side(side))
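Review note (not part of the patch): `modify` edits by character offsets and then re-aligns, per the docstring above. A hedged sketch; the exact merged output depends on `unaligned_modify_tokens` below and is not asserted here.

```python
# Illustrative only: replace the first four characters of the target side.
from parallel_corpus import graph

g = graph.init("test graph hello")
g2 = graph.modify(g, 0, 4, "best")  # character offsets, per the docstring above
print(graph.target_text(g2))        # expected to read 'best graph hello ' after re-alignment
```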
 
 
-def unaligned_modify_tokens(  # noqa: C901
+def unaligned_modify_tokens(
     g: Graph, from_: int, to: int, text: str, side: Side = Side.target
 ) -> Graph:
     """Replace the text at some position, merging the spans it touches upon.
@@ -366,7 +365,6 @@ def unaligned_modify_tokens(  # noqa: C901
 
     Indexes are token offsets
     """  # noqa: E501
-
     if (
         from_ < 0
         or to < 0
@@ -382,13 +380,12 @@ def unaligned_modify_tokens(  # noqa: C901
             return unaligned_modify_tokens(
                 g, from_ - 1, to, g.get_side(side)[from_ - 1].text + text, side
             )
-        elif to < len(g.get_side(side)):
+        if to < len(g.get_side(side)):
             return unaligned_modify_tokens(
                 g, from_, to + 1, text + g.get_side(side)[to].text, side
             )
-        else:
-            logger.warn("Introducing whitespace into empty graph")
+        logger.warning("Introducing whitespace into empty graph")
 
     if NO_WHITESPACE_AT_END.match(text[-1:]) is not None and to < len(g.get_side(side)):
         # if replacement text does not end with whitespace, grab the next word as well
@@ -421,8 +418,7 @@ def fun(e: Edge, _id: str) -> bool:
             for id_ in e.ids:
                 if id_ not in ids_removed:
                     new_edge_ids.add(id_)
-                    for lbl in e.labels:
-                        new_edge_labels.add(lbl)
+                    new_edge_labels.update(e.labels)
             return False
         return True
 
@@ -436,11 +432,12 @@ def fun(e: Edge, _id: str) -> bool:
 
 
 def unaligned_rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
-    """Moves a slice of the target tokens and puts it at a new destination.
+    """Move a slice of the target tokens and put it at a new destination.
 
     target_text(unaligned_rearrange(init('apa bepa cepa depa'), 1, 2, 0)) // => 'bepa cepa apa depa '
 
-    Indexes are token offsets"""  # noqa: E501
+    Indexes are token offsets
+    """  # noqa: E501
     em = edge_map(g)
     edge_ids_to_update = {em[t.id].id for t in g.target[begin : (end + 1)]}
     new_edges = {}
diff --git a/src/parallel_corpus/shared/__init__.py b/src/parallel_corpus/shared/__init__.py
index 04e5599..196f7e7 100644
--- a/src/parallel_corpus/shared/__init__.py
+++ b/src/parallel_corpus/shared/__init__.py
@@ -1,3 +1,5 @@
+"""Utilities."""
+
 import re
 from typing import List, TypeVar
 
diff --git a/src/parallel_corpus/shared/dicts.py b/src/parallel_corpus/shared/dicts.py
index 3176a48..7c2c07b 100644
--- a/src/parallel_corpus/shared/dicts.py
+++ b/src/parallel_corpus/shared/dicts.py
@@ -1,3 +1,5 @@
+"""Dicts."""
+
 from typing import TYPE_CHECKING, Callable, Dict, List, TypeVar
 
 if TYPE_CHECKING:
@@ -12,15 +14,15 @@
 V = TypeVar("V")
 
 
-def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V:
+def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V:  # noqa: D103
     x[k] = f(x.get(k) or default)
     return x[k]
 
 
-def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]:
+def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]:  # noqa: D103
     ks = sorted(x.keys()) if sort_keys else x.keys()
     return [k(x[i], i) for i in ks]
 
 
-def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]:
+def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]:  # noqa: D103
     return {id_: a for id_, a in x.items() if k(a, id_)}
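Review note (not part of the patch): the three dict helpers above in use; behavior is read directly from the bodies shown.

```python
# Illustrative only.
from parallel_corpus.shared import dicts

counts = {"a": 1}
dicts.modify(counts, "a", 0, lambda n: n + 1)  # -> 2; counts == {"a": 2}
dicts.modify(counts, "b", 0, lambda n: n + 1)  # missing key starts from the default: 1
dicts.traverse(counts, lambda v, k: (k, v), sort_keys=True)  # [('a', 2), ('b', 1)]
dicts.filter_dict(counts, lambda _v, k: k == "a")            # {'a': 2}
```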
diff --git a/src/parallel_corpus/shared/diffs.py b/src/parallel_corpus/shared/diffs.py
index 56d55d0..233e29c 100644
--- a/src/parallel_corpus/shared/diffs.py
+++ b/src/parallel_corpus/shared/diffs.py
@@ -1,5 +1,8 @@
+"""Diffs."""
+
 import enum
-from typing import Callable, Dict, Generic, List, Optional, Tuple, TypeVar, Union
+from itertools import starmap
+from typing import Any, Callable, Dict, Generator, Generic, List, Optional, Tuple, TypeVar, Union
 
 import diff_match_patch as dmp_module
 from typing_extensions import Self
@@ -13,14 +16,14 @@
 C = TypeVar("C")
 
 
-class ChangeType(enum.IntEnum):
+class ChangeType(enum.IntEnum):  # noqa: D101
     DELETED = -1
     CONSTANT = 0
     INSERTED = 1
 
 
-class Change(Generic[A, B]):
-    def __init__(self, change: ChangeType, a: Optional[A] = None, b: Optional[B] = None):
+class Change(Generic[A, B]):  # noqa: D101
+    def __init__(self, change: ChangeType, a: Optional[A] = None, b: Optional[B] = None) -> None:  # noqa: D107
         if change == ChangeType.DELETED and a is None:
             raise ValueError("`a` must be given for DELETED")
         if change == ChangeType.CONSTANT and (a is None or b is None):
@@ -32,18 +35,18 @@ def __init__(self, change: ChangeType, a: Optional[A] = None, b: Optional[B] = N
         self.b = b
 
     @classmethod
-    def constant(cls, a: A, b: B) -> Self:
+    def constant(cls, a: A, b: B) -> Self:  # noqa: D102
         return cls(ChangeType.CONSTANT, a=a, b=b)
 
     @classmethod
-    def deleted(cls, a: A) -> Self:
+    def deleted(cls, a: A) -> Self:  # noqa: D102
         return cls(ChangeType.DELETED, a=a)
 
     @classmethod
-    def inserted(cls, b: B) -> Self:
+    def inserted(cls, b: B) -> Self:  # noqa: D102
         return cls(ChangeType.INSERTED, b=b)
 
-    def model_dump(self) -> Dict[str, Union[int, A, B]]:
+    def model_dump(self) -> Dict[str, Union[int, A, B]]:  # noqa: D102
         out: Dict[str, Union[int, A, B]] = {
             "change": int(self.change),
         }
@@ -53,20 +56,23 @@ def model_dump(self) -> Dict[str, Union[int, A, B]]:
             out["b"] = self.b
         return out
 
-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: Any) -> bool:  # noqa: D105
         if not isinstance(other, Change):
             return NotImplemented
         return self.change == other.change and self.a == other.a and self.b == other.b
 
-    def __repr__(self) -> str:
+    def __hash__(self) -> int:  # noqa: D105
+        return hash((self.change, self.a, self.b))
+
+    def __repr__(self) -> str:  # noqa: D105
         return f"Change(change={self.change!r},a={self.a!r},b={self.b!r})"
 
-    def __str__(self) -> str:
+    def __str__(self) -> str:  # noqa: D105
         return f"Change(change={self.change},a={self.a},b={self.b})"
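Review note (not part of the patch): `model_dump` output per constructor, derived from the class above (`ChangeType` values -1/0/1; the `a`/`b` keys are emitted only when set).

```python
# Illustrative only.
from parallel_corpus.shared.diffs import Change

Change.deleted("a").model_dump()        # {'change': -1, 'a': 'a'}
Change.inserted("b").model_dump()       # {'change': 1, 'b': 'b'}
Change.constant("a", "b").model_dump()  # {'change': 0, 'a': 'a', 'b': 'b'}
```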
 
 
-def char_stream():
-    """Make a stream of all unicode characters
+def char_stream() -> Generator[str, None, None]:
+    """Make a stream of all Unicode characters.
 
     We need this because the diff-match-patch library is hard-coded to work on characters.
@@ -87,7 +93,7 @@ def char_stream():
         i += 1
 
 
-def hdiff(  # noqa: C901
+def hdiff(  # noqa: D103
     xs: List[A],
     ys: List[B],
     a_cmp: Callable[[A], str] = str,
@@ -111,8 +117,8 @@ def assign(c: C, c_cmp: Callable[[C], str], c_from: Dict[str, List[C]]) -> str:
         arr.append(c)
         return u
 
-    s1 = "".join((assign(a, a_cmp, a_from) for a in xs))
-    s2 = "".join((assign(b, b_cmp, b_from) for b in ys))
+    s1 = "".join(assign(a, a_cmp, a_from) for a in xs)
+    s2 = "".join(assign(b, b_cmp, b_from) for b in ys)
     d = dmp.diff_main(s1, s2)
 
     def str_map_change(change: int) -> Callable[[str, int], Change]:
@@ -131,17 +137,17 @@ def inner(c: str, _: int) -> Change:
 
         return inner
 
-    def map_change(change: int, cs):
+    def map_change(change: int, cs):  # noqa: ANN001, ANN202
         return str_map(cs, str_map_change(change))
 
     out = []
-    for changes in (map_change(change, cs) for change, cs in d):
+    for changes in starmap(map_change, d):
         # print(f"{changes=}")
         out.extend(changes)
     return out
 
 
-def token_diff(s1: str, s2: str) -> List[Tuple[int, str]]:
+def token_diff(s1: str, s2: str) -> List[Tuple[int, str]]:  # noqa: D103
     d = dmp.diff_main(s1, s2)
     dmp.diff_cleanupSemantic(d)
     return d
diff --git a/src/parallel_corpus/shared/functional.py b/src/parallel_corpus/shared/functional.py
index 50a9d94..062bcbd 100644
--- a/src/parallel_corpus/shared/functional.py
+++ b/src/parallel_corpus/shared/functional.py
@@ -1,9 +1,11 @@
+"""Functional utilities."""
+
 from typing import Callable, Sequence, TypeVar
 
 A = TypeVar("A")
 
 
-def take_last_while(predicate: Callable[[A], bool], xs: Sequence[A]) -> Sequence[A]:
+def take_last_while(predicate: Callable[[A], bool], xs: Sequence[A]) -> Sequence[A]:  # noqa: D103
     start = 0
     for e in reversed(xs):
         if not predicate(e):
diff --git a/src/parallel_corpus/shared/ids.py b/src/parallel_corpus/shared/ids.py
index aa0f58a..f1b2510 100644
--- a/src/parallel_corpus/shared/ids.py
+++ b/src/parallel_corpus/shared/ids.py
@@ -1,3 +1,5 @@
+"""Ids."""
+
 import re
 from typing import Iterable
 
@@ -5,7 +7,7 @@
 
 
 def next_id(xs: Iterable[str]) -> int:
-    """Calculate the next id to use from these identifiers
+    """Calculate the next id to use from these identifiers.
 
     next_id([]) // => 0
     next_id(['t1', 't2', 't3']) // => 4
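Review note (not part of the patch): the values below come from the doctest above and from tests/test_shared/test_ids.py.

```python
# Illustrative only: next_id looks at the digit groups inside each identifier.
from parallel_corpus.shared.ids import next_id

next_id([])                      # 0
next_id(["t1", "t2", "t3"])      # 4
next_id(["u2v5k1", "b3", "a0"])  # 6  (largest digit group is 5)
```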
diff --git a/src/parallel_corpus/shared/lists.py b/src/parallel_corpus/shared/lists.py
index ff44b20..a0e25e9 100644
--- a/src/parallel_corpus/shared/lists.py
+++ b/src/parallel_corpus/shared/lists.py
@@ -1,3 +1,5 @@
+"""List."""
+
 import copy
 from typing import List, Tuple, TypeVar
 
@@ -5,13 +7,14 @@
 
 
 def rearrange(xs: List[A], begin: int, end: int, dest: int) -> List[A]:
-    """Moves a slice of the items and puts back them at some destination.
+    """Move a slice of the items and put them back at some destination.
 
     rearrange([0, 1, 2, 3], 1, 2, 0) // => [1, 2, 0, 3]
     rearrange([0, 1, 2, 3], 1, 2, 3) // => [0, 3, 1, 2]
     rearrange([0, 1, 2, 3], 1, 2, 1) // => [0, 1, 2, 3]
-    rearrange([0, 1, 2, 3], 1, 2, 2) // => [0, 1, 2, 3]"""
+    rearrange([0, 1, 2, 3], 1, 2, 2) // => [0, 1, 2, 3]
+    """
     a, mid, z = split_at_3(xs, begin, end + 1)
     w = end - begin
     if dest > begin:
@@ -20,7 +23,7 @@ def rearrange(xs: List[A], begin: int, end: int, dest: int) -> List[A]:
     return pre + mid + post
 
 
-def splice(xs: List[A], start: int, count: int, *insert) -> Tuple[List[A], List[A]]:
+def splice(xs: List[A], start: int, count: int, *insert) -> Tuple[List[A], List[A]]:  # noqa: ANN002, D103
     ys = copy.deepcopy(xs)
     zs = ys[start : (start + count)]
     ys[start : (start + count)] = insert
@@ -28,16 +31,17 @@ def splice(xs: List[A], start: int, count: int, *insert) -> Tuple[List[A], List[
 
 
 def split_at_3(xs: List[A], start: int, end: int) -> Tuple[List[A], List[A], List[A]]:
-    """Split an array into three pieces
+    """Split an array into three pieces.
 
     splitAt3('0123456'.split(''), 2, 4).map(xs => xs.join('')) // => ['01', '23', '456']
     splitAt3('0123456'.split(''), 2, 2).map(xs => xs.join('')) // => ['01', '', '23456']
     splitAt3('0123456'.split(''), 2, 9).map(xs => xs.join('')) // => ['01', '23456', '']
-    splitAt3('0123456'.split(''), 0, 2).map(xs => xs.join('')) // => ['', '01', '23456']"""
+    splitAt3('0123456'.split(''), 0, 2).map(xs => xs.join('')) // => ['', '01', '23456']
+    """
     ab, c = split_at(xs, end)
     a, b = split_at(ab, start)
     return a, b, c
 
 
-def split_at(xs: List[A], index: int) -> Tuple[List[A], List[A]]:
+def split_at(xs: List[A], index: int) -> Tuple[List[A], List[A]]:  # noqa: D103
     return xs[:index], xs[index:]
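Review note (not part of the patch): the JavaScript-style doctests above, restated as runnable Python.

```python
# Illustrative only; values from the doctests above.
from parallel_corpus.shared import lists

lists.rearrange([0, 1, 2, 3], 1, 2, 0)   # [1, 2, 0, 3]
lists.split_at_3(list("0123456"), 2, 4)  # (['0', '1'], ['2', '3'], ['4', '5', '6'])
lists.split_at(list("0123456"), 2)       # (['0', '1'], ['2', '3', '4', '5', '6'])
```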
diff --git a/src/parallel_corpus/shared/ranges.py b/src/parallel_corpus/shared/ranges.py
index 6945fb6..8040894 100644
--- a/src/parallel_corpus/shared/ranges.py
+++ b/src/parallel_corpus/shared/ranges.py
@@ -1,3 +1,5 @@
+"""Ranges."""
+
 import itertools
 from typing import TypedDict
 
@@ -8,7 +10,8 @@
 
 
 def edit_range(s0: str, s: str) -> EditRange:
-    """
+    """Create an EditRange.
+
     >>> edit_range('0123456789', '0189')
     {'from': 2, 'to': 8, 'insert': ''}
@@ -39,8 +42,8 @@ def edit_range(s0: str, s: str) -> EditRange:
     patches = token_diff(s0, s)
     pre = list(itertools.takewhile(lambda i: i[0] == 0, patches))
     post = take_last_while(lambda i: i[0] == 0, patches)
-    from_ = len("".join((i[1] for i in pre)))
-    postlen = len("".join((i[1] for i in post)))
+    from_ = len("".join(i[1] for i in pre))
+    postlen = len("".join(i[1] for i in post))
     to = len(s0) - postlen
     insert = s[from_ : (len(s) - (len(s0) - to))]
     return {"from": from_, "to": to, "insert": insert}
diff --git a/src/parallel_corpus/shared/str_map.py b/src/parallel_corpus/shared/str_map.py
index d5b68b4..784fabd 100644
--- a/src/parallel_corpus/shared/str_map.py
+++ b/src/parallel_corpus/shared/str_map.py
@@ -1,7 +1,9 @@
+"""str_map."""
+
 from typing import Callable, List, TypeVar
 
 A = TypeVar("A")
 
 
-def str_map(s: str, f: Callable[[str, int], A]) -> List[A]:
+def str_map(s: str, f: Callable[[str, int], A]) -> List[A]:  # noqa: D103
     return [f(s[i], i) for i in range(len(s))]
diff --git a/src/parallel_corpus/shared/union_find.py b/src/parallel_corpus/shared/union_find.py
index e201c5d..6cf01e1 100644
--- a/src/parallel_corpus/shared/union_find.py
+++ b/src/parallel_corpus/shared/union_find.py
@@ -1,3 +1,5 @@
+"""UnionFind."""
+
 import abc
 import functools
 import json
@@ -10,11 +12,11 @@
 
 
 class UnionFindOperations(abc.ABC, Generic[A]):
-    """Union-find data structure operations"""
+    """Union-find data structure operations."""
 
     @abc.abstractmethod
     def find(self, x: A) -> A:
-        """What group does this belong to?"""
+        """Return the group that `x` belongs to."""
 
     @abc.abstractmethod
     def union(self, x: A, y: A) -> A:
@@ -25,11 +27,11 @@ def unions(self, xs: List[A]) -> None:
         """Make these belong to the same group."""
 
 
-class UnionFind(UnionFindOperations[int]):
-    def __init__(self, *, rev: Optional[List[int]] = None) -> None:
+class UnionFind(UnionFindOperations[int]):  # noqa: D101
+    def __init__(self, *, rev: Optional[List[int]] = None) -> None:  # noqa: D107
         self._rev: List[int] = rev or []
 
-    def find(self, x: int) -> int:
+    def find(self, x: int) -> int:  # noqa: D102
         while x >= len(self._rev):
             self._rev.append(None)  # type: ignore [arg-type]
         if self._rev[x] is None:
@@ -38,25 +40,25 @@ def find(self, x: int) -> int:
             self._rev[x] = self.find(self._rev[x])  # type: ignore [arg-type]
         return self._rev[x]  # type: ignore [return-value]
 
-    def union(self, x: int, y: int) -> int:
+    def union(self, x: int, y: int) -> int:  # noqa: D102
         find_x = self.find(x)
         find_y = self.find(y)
         if find_x != find_y:
             self._rev[find_y] = find_x
         return find_x
 
-    def unions(self, xs: List[int]) -> None:
+    def unions(self, xs: List[int]) -> None:  # noqa: D102
         functools.reduce(self.union, xs, xs[0])
 
 
 @dataclass
-class Renumber(Generic[A]):
+class Renumber(Generic[A]):  # noqa: D101
     bw: Dict[str, int]
     fw: Dict[int, A]
     i = 0
     serialize: Callable[[A], str]
 
-    def num(self, a: A) -> int:
+    def num(self, a: A) -> int:  # noqa: D102
         s = self.serialize(a)
         if s not in self.bw:
             self.fw[self.i] = a
@@ -64,19 +66,18 @@ def num(self, a: A) -> int:
             self.i += 1
         return self.bw[s]
 
-    def un(self, n: int) -> Optional[A]:
+    def un(self, n: int) -> Optional[A]:  # noqa: D102
         return self.fw.get(n)
 
     @classmethod
-    def init(cls, serialize: Callable[[A], str] = json.dumps) -> Self:
+    def init(cls, serialize: Callable[[A], str] = json.dumps) -> Self:  # noqa: D102
         return cls(bw={}, fw={}, serialize=serialize)
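Review note (not part of the patch): a minimal sketch of the integer union-find above; behavior is read from `find`/`union`/`unions` as shown.

```python
# Illustrative only.
from parallel_corpus.shared.union_find import UnionFind

uf = UnionFind()
uf.unions([1, 2, 3])
assert uf.find(2) == uf.find(3) == uf.find(1)
assert uf.find(4) != uf.find(1)  # untouched ids remain their own group
```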
 
 
 def renumber(
     serialize: Callable[[A], str] = json.dumps,
 ) -> Tuple[Callable[[int], Optional[A]], Callable[[A], int]]:
-    """
-    Assign unique numbers to each distinct element
+    """Assign unique numbers to each distinct element.
 
     const {un, num} = Renumber()
     num('foo') // => 0
@@ -97,26 +98,26 @@ def renumber(
 
 
 @dataclass
-class PolyUnionFind(Generic[A]):
+class PolyUnionFind(Generic[A]):  # noqa: D101
     _uf: UnionFind
     _renum: Renumber[A]
 
-    def repr(self, x: A) -> int:
+    def repr(self, x: A) -> int:  # noqa: D102
         return self._uf.find(self._renum.num(x))
 
-    def find(self, x: A) -> Optional[A]:
+    def find(self, x: A) -> Optional[A]:  # noqa: D102
         return self._renum.un(self._uf.find(self._renum.num(x)))
 
-    def union(self, x: A, y: A) -> Optional[A]:
+    def union(self, x: A, y: A) -> Optional[A]:  # noqa: D102
         return self._renum.un(self._uf.union(self._renum.num(x), self._renum.num(y)))
 
-    def unions(self, xs: List[A]) -> None:
+    def unions(self, xs: List[A]) -> None:  # noqa: D102
         num_xs_0 = self._renum.num(xs[0])
         for x in xs[1:]:
             self._uf.union(num_xs_0, self._renum.num(x))
 
 
-def poly_union_find(serialize: Callable[[str], str]) -> PolyUnionFind:
+def poly_union_find(serialize: Callable[[str], str]) -> PolyUnionFind[str]:  # noqa: D103
     renum = Renumber.init(serialize)
     uf = UnionFind()
     return PolyUnionFind(_uf=uf, _renum=renum)
diff --git a/src/parallel_corpus/shared/unique_check.py b/src/parallel_corpus/shared/unique_check.py
index be6b0d2..71e81e4 100644
--- a/src/parallel_corpus/shared/unique_check.py
+++ b/src/parallel_corpus/shared/unique_check.py
@@ -1,10 +1,13 @@
+"""UniqueCheck."""
+
 from typing import Dict, Generic, TypeVar
 
 S = TypeVar("S")
 
 
 class UniqueCheck(Generic[S]):
-    """
+    """Check if values are unique.
+
     >>> u = UniqueCheck()
     >>> u(1)
     True
@@ -20,15 +23,16 @@ class UniqueCheck(Generic[S]):
     False
     """
 
-    def __init__(self) -> None:
+    def __init__(self) -> None:  # noqa: D107
         self.c: Count[S] = Count()
 
-    def __call__(self, s: S) -> bool:
+    def __call__(self, s: S) -> bool:  # noqa: D102
         return self.c.inc(s) == 1
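Review note (not part of the patch): `graph.align` constructs this wrapper with identity serialization (`poly_union_find(lambda u: u)`), as seen earlier in this diff; a minimal sketch.

```python
# Illustrative only.
from parallel_corpus.shared.union_find import poly_union_find

puf = poly_union_find(lambda u: u)  # serialize strings as themselves
puf.unions(["apa", "bepa"])
assert puf.find("bepa") == puf.find("apa") == "apa"
```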
 
 
 class Count(Generic[S]):
-    """
+    """Counter that can be increased and queried.
+
     >>> u = Count()
     >>> u.inc(1)
     1
@@ -50,12 +54,12 @@ class Count(Generic[S]):
     1
     """
 
-    def __init__(self) -> None:
+    def __init__(self) -> None:  # noqa: D107
         self.m: Dict[S, int] = {}
 
-    def get(self, s: S) -> int:
+    def get(self, s: S) -> int:  # noqa: D102
         return self.m.get(s) or 0
 
-    def inc(self, s: S) -> int:
+    def inc(self, s: S) -> int:  # noqa: D102
         self.m[s] = self.get(s) + 1
         return self.get(s)
diff --git a/src/parallel_corpus/source_target.py b/src/parallel_corpus/source_target.py
index f8c2dd2..c555431 100644
--- a/src/parallel_corpus/source_target.py
+++ b/src/parallel_corpus/source_target.py
@@ -1,3 +1,5 @@
+"""SourceTarget."""
+
 from dataclasses import dataclass
 from typing import Callable, Generic, TypeVar
 
@@ -9,19 +11,19 @@
 B = TypeVar("B")
 
 
-class Side(strenum.StrEnum):
+class Side(strenum.StrEnum):  # noqa: D101
     source = "source"
     target = "target"
 
 
 @dataclass
-class SourceTarget(Generic[A]):
+class SourceTarget(Generic[A]):  # noqa: D101
     source: A
     target: A
 
-    def get_side(self, side: Side) -> A:
+    def get_side(self, side: Side) -> A:  # noqa: D102
         return self.source if side == Side.source else self.target
 
 
-def map_sides(g: SourceTarget[A], f: Callable[[A, Side], B]) -> SourceTarget[B]:
+def map_sides(g: SourceTarget[A], f: Callable[[A, Side], B]) -> SourceTarget[B]:  # noqa: D103
     return SourceTarget(source=f(g.source, Side.source), target=f(g.target, Side.target))
diff --git a/src/parallel_corpus/token.py b/src/parallel_corpus/token.py
index ff6e32c..9e10cb5 100644
--- a/src/parallel_corpus/token.py
+++ b/src/parallel_corpus/token.py
@@ -1,3 +1,5 @@
+"""Token."""
+
 import re
 from dataclasses import dataclass
 from typing import List, Sequence, TypedDict
 
@@ -6,23 +8,23 @@
 
 
 @dataclass
-class Text:
+class Text:  # noqa: D101
     text: str
 
 
 @dataclass
-class Token(Text):
+class Token(Text):  # noqa: D101
     id: str
 
 
 @dataclass
-class Span:
+class Span:  # noqa: D101
     begin: int
     end: int
 
 
 def text(ts: Sequence[Text]) -> str:
-    """The text in some tokens
+    """Return the text of the given tokens as a string.
 
     >>> text(identify(tokenize('apa bepa cepa '), '#'))
     'apa bepa cepa '
@@ -32,7 +34,7 @@ def text(ts: Sequence[Text]) -> str:
 
 
 def texts(ts: Sequence[Text]) -> List[str]:
-    """The texts in some tokens
+    """Return the texts of the given tokens as a list.
 
     >>> texts(identify(tokenize('apa bepa cepa '), '#'))
     ['apa ', 'bepa ', 'cepa ']
@@ -50,17 +52,18 @@ def tokenize(s: str) -> List[str]:
     )
 
 
-def identify(toks: List[str], prefix: str) -> List[Token]:
+def identify(toks: List[str], prefix: str) -> List[Token]:  # noqa: D103
     return [Token(text=text, id=f"{prefix}{i}") for i, text in enumerate(toks)]
 
 
-class TokenAt(TypedDict):
+class TokenAt(TypedDict):  # noqa: D101
     token: int
     offset: int
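Review note (not part of the patch): the token helpers above, restated from their doctests.

```python
# Illustrative only; values from the doctests above.
from parallel_corpus.token import identify, text, texts, tokenize

toks = tokenize("apa bepa cepa ")  # ['apa ', 'bepa ', 'cepa ']
tokens = identify(toks, "#")       # Token(text='apa ', id='#0'), ...
text(tokens)                       # 'apa bepa cepa '
texts(tokens)                      # ['apa ', 'bepa ', 'cepa ']
```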
 
 
 def token_at(tokens: List[str], character_offset: int) -> TokenAt:
-    """
+    """Return the token at the given character offset.
+
     >>> abc = ['012', '3456', '789']
     >>> token_at(abc, 0)
     {'token': 0, 'offset': 0}
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 5103be0..06a8b81 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -1,6 +1,7 @@
 from typing import List
 
 import pytest
+
 from parallel_corpus import graph, token
 from parallel_corpus.source_target import Side, SourceTarget
 
@@ -180,15 +181,15 @@ def show_source(g: graph.Graph) -> List[str]:
 
 
 def ids(g: graph.Graph) -> str:
-    return " ".join((t.id for t in g.target))
+    return " ".join(t.id for t in g.target)
 
 
 def ids_source(g: graph.Graph) -> str:
-    return " ".join((s.id for s in g.source))
+    return " ".join(s.id for s in g.source)
 
 
 @pytest.mark.parametrize(
-    "i0, i1, word",
+    ("i0", "i1", "word"),
     [
         (0, 0, "new"),
         (0, 1, "new"),
@@ -202,7 +203,7 @@ def ids_source(g: graph.Graph) -> str:
         (16, 16, " !"),
     ],
 )
-def test_unaligned_modify(i0: int, i1: int, word: str, snapshot):
+def test_unaligned_modify(i0: int, i1: int, word: str, snapshot) -> None:
     g = graph.init("test graph hello")
     assert g is not None
     assert show(graph.unaligned_modify(g, i0, i1, word)) == snapshot
@@ -223,13 +224,13 @@ def test_unaligned_modify_tokens() -> None:
     assert ids(g) == "t0 t1 t2"
 
 
-@pytest.mark.parametrize("text, expected", [("this", True), ("this ", False)])
+@pytest.mark.parametrize(("text", "expected"), [("this", True), ("this ", False)])
 def test_no_whitespace_at_end(text: str, *, expected: bool) -> None:
     assert (graph.NO_WHITESPACE_AT_END.match(text[-1:]) is not None) is expected
 
 
 @pytest.mark.parametrize(
-    "from_, to, text",
+    ("from_", "to", "text"),
     [
         (0, 0, "this "),
         (0, 1, "this "),
@@ -250,7 +251,7 @@ def test_unaligned_modify_tokens_show(from_: int, to: int, text: str, snapshot)
 
 
 @pytest.mark.parametrize(
-    "from_, to, text",
+    ("from_", "to", "text"),
     [
         (0, 0, "this "),
         (0, 1, "this "),
@@ -263,7 +264,7 @@ def test_unaligned_modify_tokens_ids(from_: int, to: int, text: str, snapshot) -
 
 
 @pytest.mark.parametrize(
-    "from_, to, text",
+    ("from_", "to", "text"),
     [
         (0, 0, "this "),
     ],
 )
@@ -276,7 +277,7 @@ def test_unaligned_modify_tokens_show_source(from_: int, to: int, text: str, sna
 
 
 @pytest.mark.parametrize(
-    "from_, to, text",
+    ("from_", "to", "text"),
     [
         (0, 0, "this "),
     ],
diff --git a/tests/test_shared/test_ids.py b/tests/test_shared/test_ids.py
index 152683e..31cade1 100644
--- a/tests/test_shared/test_ids.py
+++ b/tests/test_shared/test_ids.py
@@ -1,7 +1,7 @@
 from parallel_corpus.shared.ids import next_id
 
 
-def test_next_id():
+def test_next_id() -> None:
     assert next_id([]) == 0
     assert next_id(["t1", "t2", "t3"]) == 4
     assert next_id(["u2v5k1", "b3", "a0"]) == 6
diff --git a/tests/test_shared/test_lists.py b/tests/test_shared/test_lists.py
index 010742a..797c2f4 100644
--- a/tests/test_shared/test_lists.py
+++ b/tests/test_shared/test_lists.py
@@ -1,14 +1,14 @@
 from parallel_corpus.shared import lists
 
 
-def test_splice_1():
+def test_splice_1() -> None:
     (*s_chars,) = "abcdef"
     ex, rm = lists.splice(s_chars, 3, 1, " ", "_")
     assert "".join(ex) == "abc _ef"
     assert "".join(rm) == "d"
 
 
-def test_splice_2():
+def test_splice_2() -> None:
     (*s_chars,) = "abcdef"
     (ex, rm) = lists.splice(s_chars, 3, 2, " ", "_")
     assert "".join(ex) == "abc _f"
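Review note (not part of the patch): a reminder of what `edit_range` returns before the test_ranges hunk below; the first value is the doctest in ranges.py, the second follows directly from the code shown earlier.

```python
# Illustrative only.
from parallel_corpus.shared.ranges import edit_range

edit_range("0123456789", "0189")  # {'from': 2, 'to': 8, 'insert': ''}
edit_range("", "01")              # {'from': 0, 'to': 0, 'insert': '01'}
```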
s", + ("s0", "s"), [ - ("0123456789", "0189"), - ("0123456789", "01"), - ("0123456789", "89"), - ("0123456789", ""), - ("0123456789", "01xyz89"), - ("0123456789", "01xyz"), - ("0123456789", "xyz89"), - ("0123456789", "xyz"), + (string.digits, "0189"), + (string.digits, "01"), + (string.digits, "89"), + (string.digits, ""), + (string.digits, "01xyz89"), + (string.digits, "01xyz"), + (string.digits, "xyz89"), + (string.digits, "xyz"), ("", "01"), ], ) -def test_edit_range(s0: str, s: str, snapshot): +def test_edit_range(s0: str, s: str, snapshot) -> None: assert edit_range(s0, s) == snapshot diff --git a/tests/test_token.py b/tests/test_token.py index 0c16f5f..1d9dd72 100644 --- a/tests/test_token.py +++ b/tests/test_token.py @@ -1,6 +1,7 @@ from typing import List import pytest + from parallel_corpus.token import Token, identify, tokenize @@ -12,7 +13,7 @@ def test_can_create_token() -> None: @pytest.mark.parametrize( - "text, expected", + ("text", "expected"), [ ("", []), (" ", [" "]),