From 27bbfcc3f08efacb9b5421379f5fb0ed9c6d6540 Mon Sep 17 00:00:00 2001 From: Kristoffer Andersson Date: Wed, 23 Oct 2024 09:02:40 +0200 Subject: [PATCH] fix: use unambiguous regex replacement --- .../ocr_corrector.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py index 141417d..0fe5e16 100644 --- a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py +++ b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py @@ -62,11 +62,17 @@ def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: if len(curr_part) > 0: parts.append(TOK_SEP.join(curr_part)) for part in parts: - graph_initial = graph.init(part) - suggested_text = self.pipeline(part)[0]["generated_text"] - - suggested_text = PUNCTUATION.sub(r" \0", suggested_text) - graph_aligned = graph.set_target(graph_initial, suggested_text) + # graph_initial = graph.init(part) + print(f"{part=}") + raw_data = self.pipeline(part) + print(f"{raw_data=}") + suggested_text = raw_data[0]["generated_text"] + print(f"{suggested_text=}") + + suggested_text = PUNCTUATION.sub(r" \g<0>", suggested_text) + print(f"{suggested_text=}") + # graph_aligned = graph.set_target(graph_initial, suggested_text) + graph_aligned = graph.init_with_source_and_target(part, suggested_text) ocr_corrections.extend(align_and_diff(graph_aligned)) logger.debug("Finished analyzing. 
ocr_corrections=%s", ocr_corrections) @@ -74,7 +80,7 @@ def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: def align_and_diff(g: graph.Graph) -> List[Optional[str]]: - corrections = [] + corrections: List[Optional[str]] = [] edge_map = graph.edge_map(g) for s_token in g.source: edge = edge_map[s_token.id] @@ -98,7 +104,7 @@ def align_and_diff(g: graph.Graph) -> List[Optional[str]]: corrections.append(target_texts if source_text != target_texts else None) elif len(target_ids) == 1: # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44) - logger.warn( + logger.warning( f"Handle several sources, see https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501 ) target_text = lookup_text(g, target_ids[0], graph.Side.target).strip()