Skip to content

Commit

Permalink
fix: use unambiguous regex replacement
Browse files Browse the repository at this point in the history
  • Loading branch information
kod-kristoff committed Oct 23, 2024
1 parent b6d0e1b commit 27bbfcc
Showing 1 changed file with 13 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,25 @@ def calculate_corrections(self, text: List[str]) -> List[Optional[str]]:
if len(curr_part) > 0:
parts.append(TOK_SEP.join(curr_part))
for part in parts:
graph_initial = graph.init(part)
suggested_text = self.pipeline(part)[0]["generated_text"]

suggested_text = PUNCTUATION.sub(r" \0", suggested_text)
graph_aligned = graph.set_target(graph_initial, suggested_text)
# graph_initial = graph.init(part)
print(f"{part=}")
raw_data = self.pipeline(part)
print(f"{raw_data=}")
suggested_text = raw_data[0]["generated_text"]
print(f"{suggested_text=}")

suggested_text = PUNCTUATION.sub(r" \g<0>", suggested_text)
print(f"{suggested_text=}")
# graph_aligned = graph.set_target(graph_initial, suggested_text)
graph_aligned = graph.init_with_source_and_target(part, suggested_text)
ocr_corrections.extend(align_and_diff(graph_aligned))

logger.debug("Finished analyzing. ocr_corrections=%s", ocr_corrections)
return ocr_corrections


def align_and_diff(g: graph.Graph) -> List[Optional[str]]:
corrections = []
corrections: List[Optional[str]] = []
edge_map = graph.edge_map(g)
for s_token in g.source:
edge = edge_map[s_token.id]
Expand All @@ -98,7 +104,7 @@ def align_and_diff(g: graph.Graph) -> List[Optional[str]]:
corrections.append(target_texts if source_text != target_texts else None)
elif len(target_ids) == 1:
# TODO Handle this correctly (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44)
logger.warn(
logger.warning(
f"Handle several sources, see https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501
)
target_text = lookup_text(g, target_ids[0], graph.Side.target).strip()
Expand Down

0 comments on commit 27bbfcc

Please sign in to comment.