From 21944ac670bd0935c800e228dc383d066e13688e Mon Sep 17 00:00:00 2001 From: Kristoffer Andersson Date: Thu, 7 Nov 2024 12:46:58 +0100 Subject: [PATCH] fix: handle annotations when 2 tokens are merged Change to use output spans and corrections. Fixes #44 --- examples/christoph-borg/config.yaml | 21 +- .../ocr-correction-viklofg-sweocr/config.yaml | 19 +- examples/texts/dokument.txt | 3 +- .../annotations.py | 70 ++- .../ocr_corrector.py | 86 ++-- .../tests/__snapshots__/test_annotations.ambr | 14 + .../__snapshots__/test_ocr_suggestor.ambr | 480 +++++++++++++++--- .../tests/test_annotations.py | 20 + pdm.lock | 34 +- pyproject.toml | 1 + pyrightconfig.json | 3 + 11 files changed, 626 insertions(+), 125 deletions(-) create mode 100644 ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr create mode 100644 ocr-correction-viklofg-sweocr/tests/test_annotations.py create mode 100644 pyrightconfig.json diff --git a/examples/christoph-borg/config.yaml b/examples/christoph-borg/config.yaml index 5d9c306..fdacb3e 100644 --- a/examples/christoph-borg/config.yaml +++ b/examples/christoph-borg/config.yaml @@ -1,16 +1,17 @@ metadata: - id: christoph-borg - language: swe + id: christoph-borg + language: swe import: - importer: text_import:parse + importer: text_import:parse export: - annotations: - - - # - - - :stanza.pos - - :sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr - + annotations: + - + # - + - :stanza.pos + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr + sparv: - compression: none + compression: none diff --git a/examples/ocr-correction-viklofg-sweocr/config.yaml b/examples/ocr-correction-viklofg-sweocr/config.yaml index 38e6d68..28f35c7 100644 --- a/examples/ocr-correction-viklofg-sweocr/config.yaml +++ b/examples/ocr-correction-viklofg-sweocr/config.yaml @@ -1,16 +1,17 @@ metadata: - id: hello-ocr - 
language: swe + id: hello-ocr + language: swe import: - importer: text_import:parse + importer: text_import:parse export: - annotations: - - - # - - - :stanza.pos - - :sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr + annotations: + - + # - + - :stanza.pos + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr sparv: - compression: none + compression: none diff --git a/examples/texts/dokument.txt b/examples/texts/dokument.txt index 5d1bb8f..c5780a7 100644 --- a/examples/texts/dokument.txt +++ b/examples/texts/dokument.txt @@ -1 +1,2 @@ -Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken +Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken. +Jonath an saknades. diff --git a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py index ffa327a..304a13b 100644 --- a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py +++ b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py @@ -1,3 +1,5 @@ +from typing import List, Optional, Tuple + from sparv import api as sparv_api # type: ignore [import-untyped] from sparv.api import Annotation, Output, annotator # type: ignore [import-untyped] @@ -8,28 +10,74 @@ @annotator("OCR corrections as annotations", language=["swe"]) def annotate_ocr_correction( - out_ocr_correction: Output = Output( - ":sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", - cls="sbx_ocr_correction_viklofg_sweocr", - description="OCR Corrections from viklfog/swedish-ocr (format: '|:|...|)", # noqa: E501 + out_ocr: Output = Output( + "sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction", cls="ocr_correction" + ), + out_ocr_corr: Output = Output( + 
"sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", + cls="ocr_correction:correction", ), + # out_ocr_correction: Output = Output( + # ":sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", + # cls="sbx_ocr_correction_viklofg_sweocr", + # description="OCR Corrections from viklfog/swedish-ocr (format: '|:|...|)", # noqa: E501 + # ), word: Annotation = Annotation(""), sentence: Annotation = Annotation(""), + token: Annotation = Annotation(""), ) -> None: ocr_corrector = OcrCorrector.default() sentences, _orphans = sentence.get_children(word) token_word = list(word.read()) - out_ocr_correction_annotation = word.create_empty_attribute() + # out_ocr_correction_annotation = word.create_empty_attribute() + + ocr_corrections = [] logger.progress(total=len(sentences)) # type: ignore - for sent in sentences: + for sent_idx in sentences: logger.progress() # type: ignore - sent_to_tag = [token_word[token_index] for token_index in sent] + sent = [token_word[token_index] for token_index in sent_idx] + + ocr_corrections.append(ocr_corrector.calculate_corrections(sent)) + # for i, ocr_correction in enumerate(ocr_corrections, start=sent[0]): + # out_ocr_correction_annotation[i] = ocr_correction + + # logger.info("writing annotations") + # out_ocr.write(ocr_spans) + # out_ocr_corr.write(ocr_corr_ann) + parse_ocr_corrections(sentences, token, ocr_corrections, out_ocr, out_ocr_corr) + + +def parse_ocr_corrections( + sentences: List, + token: Annotation, + ocr_corrections: List[List[Tuple[Tuple[int, int], Optional[str]]]], + out_ocr: Output, + out_ocr_corr: Output, +) -> None: + ocr_spans = [] + ocr_corr_ann = [] + + token_spans = list(token.read_spans()) + for sent, corr_sent in zip(sentences, ocr_corrections): + i = 0 + for span, corr_opt in corr_sent: + start_pos = token_spans[sent[i]][0] + + i += span[1] - span[0] - ocr_corrections = ocr_corrector.calculate_corrections(sent_to_tag) - for i, 
ocr_correction in enumerate(ocr_corrections, start=sent[0]): - out_ocr_correction_annotation[i] = ocr_correction + end_pos = token_spans[sent[i - 1]][1] + logger.debug( + "(%d, %d): '%s'", + start_pos, + end_pos, + "" if corr_opt is None else corr_opt, + ) + if corr_opt is not None: + ocr_spans.append((start_pos, end_pos)) + ocr_corr_ann.append(corr_opt) logger.info("writing annotations") - out_ocr_correction.write(out_ocr_correction_annotation) + out_ocr.write(ocr_spans) + out_ocr_corr.write(ocr_corr_ann) diff --git a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py index 141417d..acf753b 100644 --- a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py +++ b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py @@ -1,7 +1,8 @@ import re -from typing import List, Optional +from typing import List, Optional, Tuple from parallel_corpus import graph +from parallel_corpus.token import Token from sparv import api as sparv_api # type: ignore [import-untyped] from transformers import ( # type: ignore [import-untyped] AutoTokenizer, @@ -44,13 +45,15 @@ def default(cls) -> "OcrCorrector": ) return cls(model=model, tokenizer=tokenizer) - def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: + def calculate_corrections( + self, text: List[str] + ) -> List[Tuple[Tuple[int, int], Optional[str]]]: logger.debug("Analyzing '%s'", text) parts: List[str] = [] curr_part: List[str] = [] curr_len = 0 - ocr_corrections: List[Optional[str]] = [] + ocr_corrections: List[Tuple[Tuple[int, int], Optional[str]]] = [] for word in text: len_word = bytes_length(word) if (curr_len + len_word + 1) > self.TEXT_LIMIT: @@ -61,66 +64,85 @@ def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: curr_len = len_word if curr_len == 0 else curr_len + len_word + 1 if len(curr_part) > 0: 
parts.append(TOK_SEP.join(curr_part)) + curr_start = 0 for part in parts: - graph_initial = graph.init(part) suggested_text = self.pipeline(part)[0]["generated_text"] - - suggested_text = PUNCTUATION.sub(r" \0", suggested_text) - graph_aligned = graph.set_target(graph_initial, suggested_text) - ocr_corrections.extend(align_and_diff(graph_aligned)) + suggested_text = PUNCTUATION.sub(r" \g<0>", suggested_text) + graph_aligned = graph.init_with_source_and_target(part, suggested_text) + span_ann, curr_start = align_and_diff(graph_aligned, curr_start=curr_start) + ocr_corrections.extend(span_ann) logger.debug("Finished analyzing. ocr_corrections=%s", ocr_corrections) return ocr_corrections -def align_and_diff(g: graph.Graph) -> List[Optional[str]]: +def align_and_diff( + g: graph.Graph, *, curr_start: int +) -> Tuple[List[Tuple[Tuple[int, int], Optional[str]]], int]: corrections = [] edge_map = graph.edge_map(g) + visited_tokens = set() for s_token in g.source: + logger.debug("checking s_token=%s", s_token) edge = edge_map[s_token.id] source_ids = [id_ for id_ in edge.ids if id_.startswith("s")] target_ids = [id_ for id_ in edge.ids if id_.startswith("t")] + target_ids_str = "-".join(target_ids) + if target_ids_str in visited_tokens: + continue + visited_tokens.add(target_ids_str) + logger.debug("processing s_token=%s", s_token) + if len(source_ids) == len(target_ids): source_text = "".join( - lookup_text(g, s_id, graph.Side.source) for s_id in source_ids + lookup_text(g.source, s_id) for s_id in source_ids ).strip() target_text = "".join( - lookup_text(g, s_id, graph.Side.target) for s_id in target_ids + lookup_text(g.target, s_id) for s_id in target_ids ).strip() - corrections.append(target_text if source_text != target_text else None) + start = curr_start + curr_start += 1 + corrections.append( + ( + (start, curr_start), + target_text if source_text != target_text else None, + ) + ) elif len(source_ids) == 1: target_texts = " ".join( - lookup_text(g, id_, 
graph.Side.target).strip() for id_ in target_ids + lookup_text(g.target, id_).strip() for id_ in target_ids ) source_text = s_token.text.strip() - corrections.append(target_texts if source_text != target_texts else None) - elif len(target_ids) == 1: - # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44) - logger.warn( - f"Handle several sources, see https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501 + start = curr_start + curr_start += 1 + + corrections.append( + ( + (start, curr_start), + target_texts if source_text != target_texts else None, + ), ) - target_text = lookup_text(g, target_ids[0], graph.Side.target).strip() - corrections.append(target_text) + elif len(target_ids) == 1: + target_text = lookup_text(g.target, target_ids[0]).strip() + start = curr_start + curr_start += len(source_ids) + corrections.append(((start, curr_start), target_text)) else: - # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44) + # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/50) raise NotImplementedError( f"Handle several sources, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501 ) - return corrections + return corrections, curr_start + +def lookup_text(tokens: List[Token], id_: str) -> str: + for token in tokens: + if token.id == id_: + return token.text -def lookup_text(g: graph.Graph, id_: str, side: graph.Side) -> str: - if side == graph.Side.source: - for token in g.source: - if token.id == id_: - return token.text - else: - for token in g.target: - if token.id == id_: - return token.text raise ValueError( - f"The id={id_} isn't found in the given graph on side={side}", + f"The id={id_} isn't found in the list of tokens", ) diff --git a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr 
b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr new file mode 100644 index 0000000..4f09e91 --- /dev/null +++ b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr @@ -0,0 +1,14 @@ +# serializer version: 1 +# name: test_annotate_ocr_correction + list([ + tuple( + 7, + 18, + ), + ]) +# --- +# name: test_annotate_ocr_correction.1 + list([ + 'ansaknades', + ]) +# --- diff --git a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr index 7a8ae9a..dac26e0 100644 --- a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr +++ b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr @@ -1,73 +1,433 @@ # serializer version: 1 # name: test_issue_40 list([ - 'Jonat han', - None, - '', - None, - None, - '', - None, - None, - None, - '', - None, - None, - None, - None, - None, - None, - None, - None, - '', + tuple( + tuple( + 0, + 1, + ), + 'Jonat han', + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + None, + ), + tuple( + tuple( + 3, + 4, + ), + None, + ), + tuple( + tuple( + 4, + 5, + ), + None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + None, + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + None, + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), + tuple( + tuple( + 12, + 13, + ), + None, + ), + tuple( + tuple( + 13, + 14, + ), + None, + ), + tuple( + tuple( + 14, + 15, + ), + None, + ), + tuple( + tuple( + 15, + 16, + ), + None, + ), + tuple( + tuple( + 16, + 17, + ), + None, + ), + tuple( + tuple( + 17, + 18, + ), + None, + ), + tuple( + tuple( + 18, + 19, + ), + None, + ), ]) # --- # name: test_long_text list([ - None, - None, - 'Riksgäldskontoret', - '', - None, - None, - None, - 
None, - None, - None, - None, - None, - None, - None, - None, - '', - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 'Riksdagsordningen', - None, - '', + tuple( + tuple( + 0, + 1, + ), + None, + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + 'Riksgäldskontoret', + ), + tuple( + tuple( + 3, + 4, + ), + None, + ), + tuple( + tuple( + 4, + 5, + ), + None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + None, + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + None, + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), + tuple( + tuple( + 12, + 13, + ), + None, + ), + tuple( + tuple( + 13, + 14, + ), + None, + ), + tuple( + tuple( + 14, + 15, + ), + None, + ), + tuple( + tuple( + 15, + 16, + ), + None, + ), + tuple( + tuple( + 16, + 17, + ), + None, + ), + tuple( + tuple( + 17, + 18, + ), + None, + ), + tuple( + tuple( + 18, + 19, + ), + None, + ), + tuple( + tuple( + 19, + 20, + ), + None, + ), + tuple( + tuple( + 20, + 21, + ), + None, + ), + tuple( + tuple( + 21, + 22, + ), + None, + ), + tuple( + tuple( + 22, + 23, + ), + None, + ), + tuple( + tuple( + 23, + 24, + ), + None, + ), + tuple( + tuple( + 24, + 25, + ), + None, + ), + tuple( + tuple( + 25, + 26, + ), + None, + ), + tuple( + tuple( + 26, + 27, + ), + 'Riksdagsordningen', + ), + tuple( + tuple( + 27, + 28, + ), + None, + ), + tuple( + tuple( + 28, + 29, + ), + '', + ), ]) # --- # name: test_short_text list([ - None, - None, - 'Handelstidningens', - 'gårdagsnummer', - None, - None, - '', - 'som', - None, - None, - 'Frölandsviken', - '', + tuple( + tuple( + 0, + 1, + ), + None, + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + 'Handelstidningens', + ), + tuple( + tuple( + 3, + 4, + ), + 'gårdagsnummer', + ), + tuple( + tuple( + 4, + 5, + ), 
+ None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + 'som', + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + 'Frölandsviken', + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), ]) # --- diff --git a/ocr-correction-viklofg-sweocr/tests/test_annotations.py b/ocr-correction-viklofg-sweocr/tests/test_annotations.py new file mode 100644 index 0000000..50ca4ea --- /dev/null +++ b/ocr-correction-viklofg-sweocr/tests/test_annotations.py @@ -0,0 +1,20 @@ +from sbx_ocr_correction_viklofg_sweocr.annotations import annotate_ocr_correction +from sparv_pipeline_testing import MemoryOutput, MockAnnotation + + +def test_annotate_ocr_correction(snapshot) -> None: + output_ocr: MemoryOutput = MemoryOutput() + output_ocr_corr: MemoryOutput = MemoryOutput() + # "Jonath an saknades ." + # "12345678901234567890" + # " 1 2" + word = MockAnnotation(name="", values=["Jonath", "an", "saknades", "."]) + sentence = MockAnnotation( + name="", children={"": [[0, 1, 2, 3]]} + ) + token = MockAnnotation(name="", spans=[(0, 6), (7, 9), (10, 18), (19, 20)]) + + annotate_ocr_correction(output_ocr, output_ocr_corr, word, sentence, token) + + assert output_ocr.values == snapshot + assert output_ocr_corr.values == snapshot diff --git a/pdm.lock b/pdm.lock index df7c31c..563b0cd 100644 --- a/pdm.lock +++ b/pdm.lock @@ -4,8 +4,11 @@ [metadata] groups = ["default", "dev"] strategy = ["cross_platform", "inherit_metadata"] -lock_version = "4.4.1" -content_hash = "sha256:fb2039beb36d0e8c2bd2e41c1d68d9471b32cfa63a05084490fbe9018e9650fa" +lock_version = "4.5.0" +content_hash = "sha256:e402f3604a71fcf2e2fbec0fc4534703e51fdd7d599400893f8db559853a316c" + +[[metadata.targets]] +requires_python = ">=3.8.1,<3.12" [[package]] name = "annotated-types" @@ -48,6 +51,9 @@ version = "23.2.0" requires_python = ">=3.7" summary = "Classes Without 
Boilerplate" groups = ["dev"] +dependencies = [ + "importlib-metadata; python_version < \"3.8\"", +] files = [ {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"}, {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, @@ -230,6 +236,7 @@ summary = "Composable command line interface toolkit" groups = ["dev"] dependencies = [ "colorama; platform_system == \"Windows\"", + "importlib-metadata; python_version < \"3.8\"", ] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, @@ -555,6 +562,7 @@ summary = "GitPython is a Python library used to interact with Git repositories" groups = ["dev"] dependencies = [ "gitdb<5,>=4.0.1", + "typing-extensions>=3.7.4.3; python_version < \"3.8\"", ] files = [ {file = "GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff"}, @@ -588,7 +596,9 @@ requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" summary = "Human friendly output for text interfaces using Python" groups = ["dev"] dependencies = [ + "monotonic; python_version == \"2.7\"", "pyreadline3; sys_platform == \"win32\" and python_version >= \"3.8\"", + "pyreadline; sys_platform == \"win32\" and python_version < \"3.8\"", ] files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, @@ -613,6 +623,7 @@ requires_python = ">=3.8" summary = "Read metadata from Python packages" groups = ["dev"] dependencies = [ + "typing-extensions>=3.6.4; python_version < \"3.8\"", "zipp>=0.5", ] files = [ @@ -1169,6 +1180,7 @@ groups = ["dev"] dependencies = [ "charset-normalizer>=2.0.0", "cryptography>=36.0.0", + "typing-extensions; python_version < \"3.8\"", ] files = [ {file = "pdfminer.six-20221105-py3-none-any.whl", hash = 
"sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, @@ -1829,6 +1841,7 @@ summary = "Format click help output nicely with rich" groups = ["dev"] dependencies = [ "click>=7", + "importlib-metadata; python_version < \"3.8\"", "rich>=10.7", "typing-extensions", ] @@ -2157,6 +2170,19 @@ files = [ {file = "sparv_pipeline-5.2.0.tar.gz", hash = "sha256:d8282fd97ebae048e40b1cd83e087fad455ab137e107ebf54e2685b3af2f13b7"}, ] +[[package]] +name = "sparv-pipeline-testing" +version = "0.1.7" +requires_python = ">=3.8.1,<3.12" +git = "https://github.com/spraakbanken/sparv-pipeline-testing.git" +ref = "v0.1.7" +revision = "dd72b2f202e1164f6d1a4d731a81e9341d54dc13" +summary = "Testing utilities for sparv-pipeline." +groups = ["dev"] +dependencies = [ + "sparv-pipeline>=5.2.0", +] + [[package]] name = "sparv-sbx-ocr-correction-viklofg-sweocr" version = "0.2.3" @@ -2521,6 +2547,7 @@ groups = ["dev"] dependencies = [ "mypy-extensions>=0.3.0", "typing-extensions>=3.7.4", + "typing>=3.7.4; python_version < \"3.5\"", ] files = [ {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, @@ -2557,6 +2584,9 @@ name = "wcwidth" version = "0.2.13" summary = "Measures the displayed width of unicode strings in a terminal" groups = ["dev"] +dependencies = [ + "backports-functools-lru-cache>=1.2.1; python_version < \"3.2\"", +] files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, diff --git a/pyproject.toml b/pyproject.toml index a14bb93..2e77e0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,4 +13,5 @@ dev = [ "ruff>=0.3.2", "bump-my-version>=0.19.0", "syrupy>=4.0.0", + "sparv-pipeline-testing @ git+https://github.com/spraakbanken/sparv-pipeline-testing.git@v0.1.7", ] diff --git 
a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..d88916d --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,3 @@ +{ + "venvPath": ".", "venv": ".venv" +}