From 21944ac670bd0935c800e228dc383d066e13688e Mon Sep 17 00:00:00 2001 From: Kristoffer Andersson Date: Thu, 7 Nov 2024 12:46:58 +0100 Subject: [PATCH] fix: handle annotations when 2 tokens are merged Change to use output spans and corrections. Fixes #44 --- examples/christoph-borg/config.yaml | 21 +- .../ocr-correction-viklofg-sweocr/config.yaml | 19 +- examples/texts/dokument.txt | 3 +- .../annotations.py | 70 ++- .../ocr_corrector.py | 86 ++-- .../tests/__snapshots__/test_annotations.ambr | 14 + .../__snapshots__/test_ocr_suggestor.ambr | 480 +++++++++++++++--- .../tests/test_annotations.py | 20 + pdm.lock | 34 +- pyproject.toml | 1 + pyrightconfig.json | 3 + 11 files changed, 626 insertions(+), 125 deletions(-) create mode 100644 ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr create mode 100644 ocr-correction-viklofg-sweocr/tests/test_annotations.py create mode 100644 pyrightconfig.json diff --git a/examples/christoph-borg/config.yaml b/examples/christoph-borg/config.yaml index 5d9c306..fdacb3e 100644 --- a/examples/christoph-borg/config.yaml +++ b/examples/christoph-borg/config.yaml @@ -1,16 +1,17 @@ metadata: - id: christoph-borg - language: swe + id: christoph-borg + language: swe import: - importer: text_import:parse + importer: text_import:parse export: - annotations: - - - # - - - :stanza.pos - - :sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr - + annotations: + - + # - + - :stanza.pos + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr + sparv: - compression: none + compression: none diff --git a/examples/ocr-correction-viklofg-sweocr/config.yaml b/examples/ocr-correction-viklofg-sweocr/config.yaml index 38e6d68..28f35c7 100644 --- a/examples/ocr-correction-viklofg-sweocr/config.yaml +++ b/examples/ocr-correction-viklofg-sweocr/config.yaml @@ -1,16 +1,17 @@ metadata: - id: hello-ocr - 
language: swe + id: hello-ocr + language: swe import: - importer: text_import:parse + importer: text_import:parse export: - annotations: - - - # - - - :stanza.pos - - :sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr + annotations: + - + # - + - :stanza.pos + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction + - sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr sparv: - compression: none + compression: none diff --git a/examples/texts/dokument.txt b/examples/texts/dokument.txt index 5d1bb8f..c5780a7 100644 --- a/examples/texts/dokument.txt +++ b/examples/texts/dokument.txt @@ -1 +1,2 @@ -Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken +Den i HandelstidniDgens g&rdagsnnmmer omtalade hvalfisken, sorn fångats i Frölnndaviken. +Jonath an saknades. diff --git a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py index ffa327a..304a13b 100644 --- a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py +++ b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/annotations.py @@ -1,3 +1,5 @@ +from typing import List, Optional, Tuple + from sparv import api as sparv_api # type: ignore [import-untyped] from sparv.api import Annotation, Output, annotator # type: ignore [import-untyped] @@ -8,28 +10,74 @@ @annotator("OCR corrections as annotations", language=["swe"]) def annotate_ocr_correction( - out_ocr_correction: Output = Output( - ":sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", - cls="sbx_ocr_correction_viklofg_sweocr", - description="OCR Corrections from viklfog/swedish-ocr (format: '|:|...|)", # noqa: E501 + out_ocr: Output = Output( + "sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction", cls="ocr_correction" + ), + out_ocr_corr: Output = Output( + 
"sbx_ocr_correction_viklofg_sweocr.sbx-ocr-correction:sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", + cls="ocr_correction:correction", ), + # out_ocr_correction: Output = Output( + # ":sbx_ocr_correction_viklofg_sweocr.ocr-correction--viklofg-sweocr", + # cls="sbx_ocr_correction_viklofg_sweocr", + # description="OCR Corrections from viklfog/swedish-ocr (format: '|:|...|)", # noqa: E501 + # ), word: Annotation = Annotation(""), sentence: Annotation = Annotation(""), + token: Annotation = Annotation(""), ) -> None: ocr_corrector = OcrCorrector.default() sentences, _orphans = sentence.get_children(word) token_word = list(word.read()) - out_ocr_correction_annotation = word.create_empty_attribute() + # out_ocr_correction_annotation = word.create_empty_attribute() + + ocr_corrections = [] logger.progress(total=len(sentences)) # type: ignore - for sent in sentences: + for sent_idx in sentences: logger.progress() # type: ignore - sent_to_tag = [token_word[token_index] for token_index in sent] + sent = [token_word[token_index] for token_index in sent_idx] + + ocr_corrections.append(ocr_corrector.calculate_corrections(sent)) + # for i, ocr_correction in enumerate(ocr_corrections, start=sent[0]): + # out_ocr_correction_annotation[i] = ocr_correction + + # logger.info("writing annotations") + # out_ocr.write(ocr_spans) + # out_ocr_corr.write(ocr_corr_ann) + parse_ocr_corrections(sentences, token, ocr_corrections, out_ocr, out_ocr_corr) + + +def parse_ocr_corrections( + sentences: List, + token: Annotation, + ocr_corrections: List[List[Tuple[Tuple[int, int], Optional[str]]]], + out_ocr: Output, + out_ocr_corr: Output, +) -> None: + ocr_spans = [] + ocr_corr_ann = [] + + token_spans = list(token.read_spans()) + for sent, corr_sent in zip(sentences, ocr_corrections): + i = 0 + for span, corr_opt in corr_sent: + start_pos = token_spans[sent[i]][0] + + i += span[1] - span[0] - ocr_corrections = ocr_corrector.calculate_corrections(sent_to_tag) - for i, 
ocr_correction in enumerate(ocr_corrections, start=sent[0]): - out_ocr_correction_annotation[i] = ocr_correction + end_pos = token_spans[sent[i - 1]][1] + logger.debug( + "(%d, %d): '%s'", + start_pos, + end_pos, + "" if corr_opt is None else corr_opt, + ) + if corr_opt is not None: + ocr_spans.append((start_pos, end_pos)) + ocr_corr_ann.append(corr_opt) logger.info("writing annotations") - out_ocr_correction.write(out_ocr_correction_annotation) + out_ocr.write(ocr_spans) + out_ocr_corr.write(ocr_corr_ann) diff --git a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py index 141417d..acf753b 100644 --- a/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py +++ b/ocr-correction-viklofg-sweocr/src/sbx_ocr_correction_viklofg_sweocr/ocr_corrector.py @@ -1,7 +1,8 @@ import re -from typing import List, Optional +from typing import List, Optional, Tuple from parallel_corpus import graph +from parallel_corpus.token import Token from sparv import api as sparv_api # type: ignore [import-untyped] from transformers import ( # type: ignore [import-untyped] AutoTokenizer, @@ -44,13 +45,15 @@ def default(cls) -> "OcrCorrector": ) return cls(model=model, tokenizer=tokenizer) - def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: + def calculate_corrections( + self, text: List[str] + ) -> List[Tuple[Tuple[int, int], Optional[str]]]: logger.debug("Analyzing '%s'", text) parts: List[str] = [] curr_part: List[str] = [] curr_len = 0 - ocr_corrections: List[Optional[str]] = [] + ocr_corrections: List[Tuple[Tuple[int, int], Optional[str]]] = [] for word in text: len_word = bytes_length(word) if (curr_len + len_word + 1) > self.TEXT_LIMIT: @@ -61,66 +64,85 @@ def calculate_corrections(self, text: List[str]) -> List[Optional[str]]: curr_len = len_word if curr_len == 0 else curr_len + len_word + 1 if len(curr_part) > 0: 
parts.append(TOK_SEP.join(curr_part)) + curr_start = 0 for part in parts: - graph_initial = graph.init(part) suggested_text = self.pipeline(part)[0]["generated_text"] - - suggested_text = PUNCTUATION.sub(r" \0", suggested_text) - graph_aligned = graph.set_target(graph_initial, suggested_text) - ocr_corrections.extend(align_and_diff(graph_aligned)) + suggested_text = PUNCTUATION.sub(r" \g<0>", suggested_text) + graph_aligned = graph.init_with_source_and_target(part, suggested_text) + span_ann, curr_start = align_and_diff(graph_aligned, curr_start=curr_start) + ocr_corrections.extend(span_ann) logger.debug("Finished analyzing. ocr_corrections=%s", ocr_corrections) return ocr_corrections -def align_and_diff(g: graph.Graph) -> List[Optional[str]]: +def align_and_diff( + g: graph.Graph, *, curr_start: int +) -> Tuple[List[Tuple[Tuple[int, int], Optional[str]]], int]: corrections = [] edge_map = graph.edge_map(g) + visited_tokens = set() for s_token in g.source: + logger.debug("checking s_token=%s", s_token) edge = edge_map[s_token.id] source_ids = [id_ for id_ in edge.ids if id_.startswith("s")] target_ids = [id_ for id_ in edge.ids if id_.startswith("t")] + target_ids_str = "-".join(target_ids) + if target_ids_str in visited_tokens: + continue + visited_tokens.add(target_ids_str) + logger.debug("processing s_token=%s", s_token) + if len(source_ids) == len(target_ids): source_text = "".join( - lookup_text(g, s_id, graph.Side.source) for s_id in source_ids + lookup_text(g.source, s_id) for s_id in source_ids ).strip() target_text = "".join( - lookup_text(g, s_id, graph.Side.target) for s_id in target_ids + lookup_text(g.target, s_id) for s_id in target_ids ).strip() - corrections.append(target_text if source_text != target_text else None) + start = curr_start + curr_start += 1 + corrections.append( + ( + (start, curr_start), + target_text if source_text != target_text else None, + ) + ) elif len(source_ids) == 1: target_texts = " ".join( - lookup_text(g, id_, 
graph.Side.target).strip() for id_ in target_ids + lookup_text(g.target, id_).strip() for id_ in target_ids ) source_text = s_token.text.strip() - corrections.append(target_texts if source_text != target_texts else None) - elif len(target_ids) == 1: - # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44) - logger.warn( - f"Handle several sources, see https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501 + start = curr_start + curr_start += 1 + + corrections.append( + ( + (start, curr_start), + target_texts if source_text != target_texts else None, + ), ) - target_text = lookup_text(g, target_ids[0], graph.Side.target).strip() - corrections.append(target_text) + elif len(target_ids) == 1: + target_text = lookup_text(g.target, target_ids[0]).strip() + start = curr_start + curr_start += len(source_ids) + corrections.append(((start, curr_start), target_text)) else: - # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/44) + # TODO Handle this correct (https://github.com/spraakbanken/sparv-sbx-ocr-correction/issues/50) raise NotImplementedError( f"Handle several sources, {source_ids=} {target_ids=} {g.source=} {g.target=}" # noqa: E501 ) - return corrections + return corrections, curr_start + +def lookup_text(tokens: List[Token], id_: str) -> str: + for token in tokens: + if token.id == id_: + return token.text -def lookup_text(g: graph.Graph, id_: str, side: graph.Side) -> str: - if side == graph.Side.source: - for token in g.source: - if token.id == id_: - return token.text - else: - for token in g.target: - if token.id == id_: - return token.text raise ValueError( - f"The id={id_} isn't found in the given graph on side={side}", + f"The id={id_} isn't found in the list of tokens", ) diff --git a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr 
b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr new file mode 100644 index 0000000..4f09e91 --- /dev/null +++ b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_annotations.ambr @@ -0,0 +1,14 @@ +# serializer version: 1 +# name: test_annotate_ocr_correction + list([ + tuple( + 7, + 18, + ), + ]) +# --- +# name: test_annotate_ocr_correction.1 + list([ + 'ansaknades', + ]) +# --- diff --git a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr index 7a8ae9a..dac26e0 100644 --- a/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr +++ b/ocr-correction-viklofg-sweocr/tests/__snapshots__/test_ocr_suggestor.ambr @@ -1,73 +1,433 @@ # serializer version: 1 # name: test_issue_40 list([ - 'Jonat han', - None, - '', - None, - None, - '', - None, - None, - None, - '', - None, - None, - None, - None, - None, - None, - None, - None, - '', + tuple( + tuple( + 0, + 1, + ), + 'Jonat han', + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + None, + ), + tuple( + tuple( + 3, + 4, + ), + None, + ), + tuple( + tuple( + 4, + 5, + ), + None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + None, + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + None, + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), + tuple( + tuple( + 12, + 13, + ), + None, + ), + tuple( + tuple( + 13, + 14, + ), + None, + ), + tuple( + tuple( + 14, + 15, + ), + None, + ), + tuple( + tuple( + 15, + 16, + ), + None, + ), + tuple( + tuple( + 16, + 17, + ), + None, + ), + tuple( + tuple( + 17, + 18, + ), + None, + ), + tuple( + tuple( + 18, + 19, + ), + None, + ), ]) # --- # name: test_long_text list([ - None, - None, - 'Riksgäldskontoret', - '', - None, - None, - None, - 
None, - None, - None, - None, - None, - None, - None, - None, - '', - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - 'Riksdagsordningen', - None, - '', + tuple( + tuple( + 0, + 1, + ), + None, + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + 'Riksgäldskontoret', + ), + tuple( + tuple( + 3, + 4, + ), + None, + ), + tuple( + tuple( + 4, + 5, + ), + None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + None, + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + None, + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), + tuple( + tuple( + 12, + 13, + ), + None, + ), + tuple( + tuple( + 13, + 14, + ), + None, + ), + tuple( + tuple( + 14, + 15, + ), + None, + ), + tuple( + tuple( + 15, + 16, + ), + None, + ), + tuple( + tuple( + 16, + 17, + ), + None, + ), + tuple( + tuple( + 17, + 18, + ), + None, + ), + tuple( + tuple( + 18, + 19, + ), + None, + ), + tuple( + tuple( + 19, + 20, + ), + None, + ), + tuple( + tuple( + 20, + 21, + ), + None, + ), + tuple( + tuple( + 21, + 22, + ), + None, + ), + tuple( + tuple( + 22, + 23, + ), + None, + ), + tuple( + tuple( + 23, + 24, + ), + None, + ), + tuple( + tuple( + 24, + 25, + ), + None, + ), + tuple( + tuple( + 25, + 26, + ), + None, + ), + tuple( + tuple( + 26, + 27, + ), + 'Riksdagsordningen', + ), + tuple( + tuple( + 27, + 28, + ), + None, + ), + tuple( + tuple( + 28, + 29, + ), + '', + ), ]) # --- # name: test_short_text list([ - None, - None, - 'Handelstidningens', - 'gårdagsnummer', - None, - None, - '', - 'som', - None, - None, - 'Frölandsviken', - '', + tuple( + tuple( + 0, + 1, + ), + None, + ), + tuple( + tuple( + 1, + 2, + ), + None, + ), + tuple( + tuple( + 2, + 3, + ), + 'Handelstidningens', + ), + tuple( + tuple( + 3, + 4, + ), + 'gårdagsnummer', + ), + tuple( + tuple( + 4, + 5, + ), 
+ None, + ), + tuple( + tuple( + 5, + 6, + ), + None, + ), + tuple( + tuple( + 6, + 7, + ), + None, + ), + tuple( + tuple( + 7, + 8, + ), + 'som', + ), + tuple( + tuple( + 8, + 9, + ), + None, + ), + tuple( + tuple( + 9, + 10, + ), + None, + ), + tuple( + tuple( + 10, + 11, + ), + 'Frölandsviken', + ), + tuple( + tuple( + 11, + 12, + ), + None, + ), ]) # --- diff --git a/ocr-correction-viklofg-sweocr/tests/test_annotations.py b/ocr-correction-viklofg-sweocr/tests/test_annotations.py new file mode 100644 index 0000000..50ca4ea --- /dev/null +++ b/ocr-correction-viklofg-sweocr/tests/test_annotations.py @@ -0,0 +1,20 @@ +from sbx_ocr_correction_viklofg_sweocr.annotations import annotate_ocr_correction +from sparv_pipeline_testing import MemoryOutput, MockAnnotation + + +def test_annotate_ocr_correction(snapshot) -> None: + output_ocr: MemoryOutput = MemoryOutput() + output_ocr_corr: MemoryOutput = MemoryOutput() + # "Jonath an saknades ." + # "12345678901234567890" + # " 1 2" + word = MockAnnotation(name="", values=["Jonath", "an", "saknades", "."]) + sentence = MockAnnotation( + name="", children={"": [[0, 1, 2, 3]]} + ) + token = MockAnnotation(name="", spans=[(0, 6), (7, 9), (10, 18), (19, 20)]) + + annotate_ocr_correction(output_ocr, output_ocr_corr, word, sentence, token) + + assert output_ocr.values == snapshot + assert output_ocr_corr.values == snapshot diff --git a/pdm.lock b/pdm.lock index df7c31c..563b0cd 100644 --- a/pdm.lock +++ b/pdm.lock @@ -4,8 +4,11 @@ [metadata] groups = ["default", "dev"] strategy = ["cross_platform", "inherit_metadata"] -lock_version = "4.4.1" -content_hash = "sha256:fb2039beb36d0e8c2bd2e41c1d68d9471b32cfa63a05084490fbe9018e9650fa" +lock_version = "4.5.0" +content_hash = "sha256:e402f3604a71fcf2e2fbec0fc4534703e51fdd7d599400893f8db559853a316c" + +[[metadata.targets]] +requires_python = ">=3.8.1,<3.12" [[package]] name = "annotated-types" @@ -48,6 +51,9 @@ version = "23.2.0" requires_python = ">=3.7" summary = "Classes Without 
Boilerplate" groups = ["dev"] +dependencies = [ + "importlib-metadata; python_version < \"3.8\"", +] files = [ {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"}, {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"}, @@ -230,6 +236,7 @@ summary = "Composable command line interface toolkit" groups = ["dev"] dependencies = [ "colorama; platform_system == \"Windows\"", + "importlib-metadata; python_version < \"3.8\"", ] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, @@ -555,6 +562,7 @@ summary = "GitPython is a Python library used to interact with Git repositories" groups = ["dev"] dependencies = [ "gitdb<5,>=4.0.1", + "typing-extensions>=3.7.4.3; python_version < \"3.8\"", ] files = [ {file = "GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff"}, @@ -588,7 +596,9 @@ requires_python = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" summary = "Human friendly output for text interfaces using Python" groups = ["dev"] dependencies = [ + "monotonic; python_version == \"2.7\"", "pyreadline3; sys_platform == \"win32\" and python_version >= \"3.8\"", + "pyreadline; sys_platform == \"win32\" and python_version < \"3.8\"", ] files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, @@ -613,6 +623,7 @@ requires_python = ">=3.8" summary = "Read metadata from Python packages" groups = ["dev"] dependencies = [ + "typing-extensions>=3.6.4; python_version < \"3.8\"", "zipp>=0.5", ] files = [ @@ -1169,6 +1180,7 @@ groups = ["dev"] dependencies = [ "charset-normalizer>=2.0.0", "cryptography>=36.0.0", + "typing-extensions; python_version < \"3.8\"", ] files = [ {file = "pdfminer.six-20221105-py3-none-any.whl", hash = 
"sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, @@ -1829,6 +1841,7 @@ summary = "Format click help output nicely with rich" groups = ["dev"] dependencies = [ "click>=7", + "importlib-metadata; python_version < \"3.8\"", "rich>=10.7", "typing-extensions", ] @@ -2157,6 +2170,19 @@ files = [ {file = "sparv_pipeline-5.2.0.tar.gz", hash = "sha256:d8282fd97ebae048e40b1cd83e087fad455ab137e107ebf54e2685b3af2f13b7"}, ] +[[package]] +name = "sparv-pipeline-testing" +version = "0.1.7" +requires_python = ">=3.8.1,<3.12" +git = "https://github.com/spraakbanken/sparv-pipeline-testing.git" +ref = "v0.1.7" +revision = "dd72b2f202e1164f6d1a4d731a81e9341d54dc13" +summary = "Testing utilities for sparv-pipeline." +groups = ["dev"] +dependencies = [ + "sparv-pipeline>=5.2.0", +] + [[package]] name = "sparv-sbx-ocr-correction-viklofg-sweocr" version = "0.2.3" @@ -2521,6 +2547,7 @@ groups = ["dev"] dependencies = [ "mypy-extensions>=0.3.0", "typing-extensions>=3.7.4", + "typing>=3.7.4; python_version < \"3.5\"", ] files = [ {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, @@ -2557,6 +2584,9 @@ name = "wcwidth" version = "0.2.13" summary = "Measures the displayed width of unicode strings in a terminal" groups = ["dev"] +dependencies = [ + "backports-functools-lru-cache>=1.2.1; python_version < \"3.2\"", +] files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, diff --git a/pyproject.toml b/pyproject.toml index a14bb93..2e77e0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,4 +13,5 @@ dev = [ "ruff>=0.3.2", "bump-my-version>=0.19.0", "syrupy>=4.0.0", + "sparv-pipeline-testing @ git+https://github.com/spraakbanken/sparv-pipeline-testing.git@v0.1.7", ] diff --git 
a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..d88916d --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,3 @@ +{ + "venvPath": ".", "venv": ".venv" +}