refactor: fix some lint errors
kod-kristoff committed Oct 22, 2024
1 parent f73b2e1 commit 056b7d6
Showing 20 changed files with 232 additions and 158 deletions.
90 changes: 64 additions & 26 deletions ruff.toml
@@ -1,36 +1,74 @@
line-length = 97

target-version = "py38"

[lint]
# Enable flake8-bugbear (`B`) rules.
select = [
"A",
# "ANN",
"B",
"BLE",
"C4",
"C90",
# "D",
"E",
"F",
"FBT",
"I",
"RUF",
"S",
"YTT",
"A", # flake8-builtins
"ANN", # flake8-annotations
"ARG", # flake8-unused-arguments
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"COM", # flake8-commas
"D", # pydocstyle
"D400", # pydocstyle: ends-in-period
"D401", # pydocstyle: non-imperative-mood
"E", # pycodestyle: errors
"F", # Pyflakes
"FLY", # flynt
"FURB", # refurb
"G", # flake8-logging-format
"I", # isort
"ISC", # flake8-implicit-str-concat
"N", # pep8-naming
"PERF", # Perflint
"PIE", # flake8-pie
"PL", # Pylint
"PT", # flake8-pytest-style
"PTH", # flake8-use-pathlib
"Q", # flake8-quotes
"RET", # flake8-return
"RSE", # flake8-raise
"RUF", # Ruff-specific rules
"SIM", # flake8-simplify
"T20", # flake8-print
"TID", # flake8-tidy-imports
"UP", # pyupgrade
"W", # pycodestyle: warnings
]
ignore = [
"ANN101", # flake8-annotations: missing-type-self (deprecated)
"ANN102", # flake8-annotations: missing-type-cls (deprecated)
"ANN401", # flake8-annotations: any-type
"B008", # flake8-bugbear: function-call-in-default-argument
"COM812", # flake8-commas: missing-trailing-comma
"E741", # pycodestyle: ambiguous-variable-name
"PLR09", # Pylint: too-many-*
"PLR1702", # Pylint: too-many-nested-blocks
"SIM105", # flake8-simplify: suppressible-exception
]
preview = true

# Never enforce `E501` (line length violations).
# ignore = ["E501"]
ignore = ["ANN101", "ANN102", "D203", "D213"]
[lint.per-file-ignores]
"__init__.py" = ["F401"]
"tests/*.py" = [
"D100",
"D101",
"D102",
"D103",
"D104",
"S101",
"ANN001",
"PLR2004",
"N806",
"T201",
]

[lint.pydocstyle]
convention = "google"

# Avoid trying to fix flake8-bugbear (`B`) violations.
unfixable = ["B"]
# # Never enforce `E501` (line length violations).
# # ignore = ["E501"]
# ignore = ["ANN101", "ANN102", "D203", "D213"]

# Ignore `E402` (import violations) in all `__init__.py` files, and in `path/to/file.py`.
[lint.per-file-ignores]
"tests/*.py" = ["D100", "D101", "D102", "D103", "D104", "S101"]

# "__init__.py" = ["E402"]
# # Avoid trying to fix flake8-bugbear (`B`) violations.
# unfixable = ["B"]
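The rewritten configuration enables pydocstyle with the Google convention (see `[lint.pydocstyle]` above) together with `D400` (first line ends in a period) and `D401` (imperative mood). As a minimal sketch, a hypothetical function documented like this should pass those checks under the new settings:

```python
def normalize_text(text: str) -> str:
    """Normalize whitespace in ``text``.

    Args:
        text: The raw input string.

    Returns:
        The input with runs of whitespace collapsed to single spaces.
    """
    return " ".join(text.split())
```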
1 change: 1 addition & 0 deletions src/parallel_corpus/__init__.py
@@ -0,0 +1 @@
"""Parallel corpus as a graph."""
71 changes: 34 additions & 37 deletions src/parallel_corpus/graph.py
@@ -26,7 +26,7 @@


@dataclass
class Edge:
class Edge: # noqa: D101
# a copy of the identifier used in the edges object of the graph
id: str
# these are ids to source and target tokens
@@ -42,26 +42,26 @@ class Edge:


@dataclass
class Graph(SourceTarget[List[Token]]):
class Graph(SourceTarget[List[Token]]): # noqa: D101
edges: Edges
comment: Optional[str] = None

def copy_with_updated_side_and_edges(
def copy_with_updated_side_and_edges( # noqa: D102
self, side: Side, new_tokens: List[Token], edges: Edges
) -> "Graph":
source = self.source if side == Side.target else new_tokens
target = new_tokens if side == Side.target else self.target
return Graph(source=source, target=target, edges=edges, comment=self.comment)

def copy_with_edges(self, edges: Edges) -> "Graph":
def copy_with_edges(self, edges: Edges) -> "Graph": # noqa: D102
return Graph(source=self.source, target=self.target, edges=edges, comment=self.comment)


def next_id(g: Graph) -> int:
def next_id(g: Graph) -> int: # noqa: D103
return ids.next_id(itertools.chain((t.id for t in g.target), (s.id for s in g.source)))


def edge(
def edge( # noqa: D103
ids: List[str],
labels: List[str],
*,
@@ -79,21 +79,21 @@ def edge(
)


def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]:
def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]: # noqa: D103
return {e.id: e for e in es}


def init(s: str, *, manual: bool = False) -> Graph:
def init(s: str, *, manual: bool = False) -> Graph: # noqa: D103
return init_from(token.tokenize(s), manual=manual)


def init_with_source_and_target(source: str, target: str, *, manual: bool = False) -> Graph:
def init_with_source_and_target(source: str, target: str, *, manual: bool = False) -> Graph: # noqa: D103
return init_from_source_and_target(
source=token.tokenize(source), target=token.tokenize(target), manual=manual
)


def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
def init_from(tokens: List[str], *, manual: bool = False) -> Graph: # noqa: D103
return align(
Graph(
source=token.identify(tokens, "s"),
@@ -105,7 +105,7 @@ def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
)


def init_from_source_and_target(
def init_from_source_and_target( # noqa: D103
source: List[str], target: List[str], *, manual: bool = False
) -> Graph:
source_tokens = token.identify(source, "s")
@@ -124,13 +124,13 @@
)


class TextLabels(TypedDict):
class TextLabels(TypedDict): # noqa: D101
text: str
labels: List[str]


def from_unaligned(st: SourceTarget[List[TextLabels]]) -> Graph:
"""Initialize a graph from unaligned tokens"""
"""Initialize a graph from unaligned tokens."""
edges: Dict[str, Edge] = {}

def proto_token_to_token(tok: TextLabels, i: int, prefix: str) -> Token:
@@ -150,19 +150,19 @@ def proto_tokens_to_tokens(toks: List[TextLabels], side: Side) -> List[Token]:
return align(Graph(source=g.source, target=g.target, edges=edges))


def modify(g: Graph, from_: int, to: int, text: str, side: Side = Side.target) -> Graph:
def modify(g: Graph, from_: int, to: int, text: str, side: Side = Side.target) -> Graph: # noqa: D103
return align(unaligned_modify(g, from_, to, text, side))


def set_source(g: Graph, text: str) -> Graph:
def set_source(g: Graph, text: str) -> Graph: # noqa: D103
return align(unaligned_set_side(g, Side.source, text))


def set_target(g: Graph, text: str) -> Graph:
def set_target(g: Graph, text: str) -> Graph: # noqa: D103
return align(unaligned_set_side(g, Side.target, text))


def merge_edges(*es) -> Edge:
def merge_edges(*es) -> Edge: # noqa: ANN002, D103
ids = []
labels = []
manual = False
@@ -184,7 +184,7 @@ def merge_edges(*es) -> Edge:
zero_edge = merge_edges()


def align(g: Graph) -> Graph:
def align(g: Graph) -> Graph: # noqa: D103
# Use a union-find to group characters into edges.
uf = parallel_corpus.shared.union_find.poly_union_find(lambda u: u)
em = edge_map(g)
@@ -207,7 +207,7 @@ def align(g: Graph) -> Graph:
proto_edges = {k: e for k, e in g.edges.items() if e.manual}
first: UniqueCheck[str] = UniqueCheck()

def update_edges(tokens, _side):
def update_edges(tokens, _side) -> None: # noqa: ANN001
for tok in tokens:
e_repr = em[tok.id]
if not e_repr.manual:
@@ -225,29 +225,29 @@ def update_edges(tokens, _side):
return g.copy_with_edges(edges)


def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph: # noqa: D103
return align(unaligned_rearrange(g, begin, end, dest))


def target_text(g: SourceTarget[List[token.Text]]) -> str:
def target_text(g: SourceTarget[List[token.Text]]) -> str: # noqa: D103
return token.text(g.target)


@dataclass
class CharIdPair:
class CharIdPair: # noqa: D101
char: str
id: Optional[str] = None


def to_char_ids(token: Token) -> List[CharIdPair]:
def to_char_ids(token: Token) -> List[CharIdPair]: # noqa: D103
return parallel_corpus.shared.str_map.str_map(
token.text,
lambda char, _i: CharIdPair(char=char, id=None if char == " " else token.id),
)


def edge_map(g: Graph) -> Dict[str, Edge]:
"""Map from token ids to edges
"""Map from token ids to edges.
Args:
g (Graph): the Graph to build the edge map from.
@@ -262,7 +262,7 @@
return edges


def unaligned_set_side(g: Graph, side: Side, text: str) -> Graph:
def unaligned_set_side(g: Graph, side: Side, text: str) -> Graph: # noqa: D103
text0 = get_side_text(g, side)
edits = parallel_corpus.shared.ranges.edit_range(text0, text)

@@ -313,7 +313,6 @@ def unaligned_modify(
Indexes are character offsets (use CodeMirror's doc.posFromIndex and doc.indexFromPos to convert)
""" # noqa: E501

tokens = get_side_texts(g, side)
token_at = token.token_at(tokens, from_)
from_token, from_ix = token_at["token"], token_at["offset"]
@@ -326,15 +325,15 @@
return unaligned_modify_tokens(g, from_token, to_token + 1, pre + text + post, side)


def get_side_text(g: Graph, side: Side) -> str:
def get_side_text(g: Graph, side: Side) -> str: # noqa: D103
return token.text(g.get_side(side))


def get_side_texts(g: Graph, side: Side) -> List[str]:
def get_side_texts(g: Graph, side: Side) -> List[str]: # noqa: D103
return token.texts(g.get_side(side))


def unaligned_modify_tokens( # noqa: C901
def unaligned_modify_tokens(
g: Graph, from_: int, to: int, text: str, side: Side = Side.target
) -> Graph:
"""Replace the text at some position, merging the spans it touches upon.
@@ -366,7 +365,6 @@ def unaligned_modify_tokens( # noqa: C901
Indexes are token offsets
""" # noqa: E501

if (
from_ < 0
or to < 0
@@ -382,13 +380,12 @@
return unaligned_modify_tokens(
g, from_ - 1, to, g.get_side(side)[from_ - 1].text + text, side
)
elif to < len(g.get_side(side)):
if to < len(g.get_side(side)):
return unaligned_modify_tokens(
g, from_, to + 1, text + g.get_side(side)[to].text, side
)

else:
logger.warn("Introducing whitespace into empty graph")
logger.warning("Introducing whitespace into empty graph")

if NO_WHITESPACE_AT_END.match(text[-1:]) is not None and to < len(g.get_side(side)):
# if replacement text does not end with whitespace, grab the next word as well
@@ -421,8 +418,7 @@ def fun(e: Edge, _id: str) -> bool:
for id_ in e.ids:
if id_ not in ids_removed:
new_edge_ids.add(id_)
for lbl in e.labels:
new_edge_labels.add(lbl)
new_edge_labels.update(e.labels)
return False
return True

@@ -436,11 +432,12 @@ def fun(e: Edge, _id: str) -> bool:


def unaligned_rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
"""Moves a slice of the target tokens and puts it at a new destination.
"""Move a slice of the target tokens and puts it at a new destination.
target_text(unaligned_rearrange(init('apa bepa cepa depa'), 1, 2, 0)) // => 'bepa cepa apa depa '
Indexes are token offsets""" # noqa: E501
Indexes are token offsets
""" # noqa: E501
em = edge_map(g)
edge_ids_to_update = {em[t.id].id for t in g.target[begin : (end + 1)]}
new_edges = {}
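Taken together, the helpers touched in this file form a small graph-editing API. A minimal usage sketch, assuming the module is importable as `parallel_corpus.graph` (the import path is an assumption; the expected output mirrors the doctest shown in `unaligned_rearrange`):

```python
from parallel_corpus import graph

# Build a self-aligned graph from a string, then move target tokens 1..2
# in front of token 0 via the aligning wrapper around unaligned_rearrange.
g = graph.init("apa bepa cepa depa")
g = graph.rearrange(g, 1, 2, 0)
print(graph.target_text(g))  # => "bepa cepa apa depa "
```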
2 changes: 2 additions & 0 deletions src/parallel_corpus/shared/__init__.py
@@ -1,3 +1,5 @@
"""Utilities."""

import re
from typing import List, TypeVar

8 changes: 5 additions & 3 deletions src/parallel_corpus/shared/dicts.py
@@ -1,3 +1,5 @@
"""Dicts."""

from typing import TYPE_CHECKING, Callable, Dict, List, TypeVar

if TYPE_CHECKING:
@@ -12,15 +14,15 @@
V = TypeVar("V")


def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V:
def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V: # noqa: D103
x[k] = f(x.get(k) or default)
return x[k]


def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]:
def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]: # noqa: D103
ks = sorted(x.keys()) if sort_keys else x.keys()
return [k(x[i], i) for i in ks]


def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]:
def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]: # noqa: D103
return {id_: a for id_, a in x.items() if k(a, id_)}
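For context, a small usage sketch of these three helpers, following their behavior as shown in the diff (the example values are hypothetical and the import path is an assumption):

```python
from parallel_corpus.shared import dicts

counts = {"a": 1}
dicts.modify(counts, "a", 0, lambda v: v + 1)  # counts["a"] -> 2
dicts.modify(counts, "b", 0, lambda v: v + 1)  # missing key falls back to the default 0

pairs = dicts.traverse(counts, lambda v, key: f"{key}={v}", sort_keys=True)
# pairs == ["a=2", "b=1"]

big = dicts.filter_dict(counts, lambda v, _key: v > 1)
# big == {"a": 2}
```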