From 83baf9cc50917b7de9609a42376d99f8af24344e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 13 Sep 2024 01:04:29 +0200
Subject: [PATCH 01/10] split up prediction to avoid overly large batches
 (causing OOM)

---
 ocrd_calamari/recognize.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 1ab11f5..35b5efb 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -43,6 +43,16 @@
 
 TOOL = "ocrd-calamari-recognize"
 
+BATCH_SIZE = 64
+if not hasattr(itertools, 'batched'):
+    def batched(iterable, n):
+        # batched('ABCDEFG', 3) → ABC DEF G
+        if n < 1:
+            raise ValueError('n must be at least one')
+        iterator = iter(iterable)
+        while batch := tuple(itertools.islice(iterator, n)):
+            yield batch
+    itertools.batched = batched
 
 class CalamariRecognize(Processor):
     def __init__(self, *args, **kwargs):
@@ -166,9 +176,11 @@ def process(self):
                         line_image_np = np.array(line_image, dtype=np.uint8)
                     line_images_np.append(line_image_np)
                     line_coordss.append(line_coords)
-                raw_results_all = self.predictor.predict_raw(
-                    line_images_np, progress_bar=False
-                )
+
+                # avoid too large a batch size (causing OOM on CPU or GPU)
+                fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+                raw_results_all = itertools.chain.from_iterable(
+                    map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
 
                 for line, line_coords, raw_results in zip(
                     textlines, line_coordss, raw_results_all

From bf755a38e6cf6a6c51ec1e727b91900cb567553f Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Fri, 13 Sep 2024 11:12:53 +0200
Subject: [PATCH 02/10] adapt to ocrd>=3.0

---
 ocrd_calamari/config.py      |   5 -
 ocrd_calamari/ocrd-tool.json |   8 +-
 ocrd_calamari/recognize.py   | 512 ++++++++++++++++-------------------
 3 files changed, 241 insertions(+), 284 deletions(-)
 delete mode 100644 ocrd_calamari/config.py

diff --git a/ocrd_calamari/config.py b/ocrd_calamari/config.py
deleted file mode 100644
index 1729f8c..0000000
--- a/ocrd_calamari/config.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import json
-
-from pkg_resources import resource_string
-
-OCRD_TOOL = json.loads(resource_string(__name__, "ocrd-tool.json").decode("utf8"))
diff --git a/ocrd_calamari/ocrd-tool.json b/ocrd_calamari/ocrd-tool.json
index 6bdb971..6bce4cb 100644
--- a/ocrd_calamari/ocrd-tool.json
+++ b/ocrd_calamari/ocrd-tool.json
@@ -11,12 +11,8 @@
         "recognition/text-recognition"
       ],
       "description": "Recognize lines with Calamari",
-      "input_file_grp": [
-        "OCR-D-SEG-LINE"
-      ],
-      "output_file_grp": [
-        "OCR-D-OCR-CALAMARI"
-      ],
+      "input_file_grp_cardinality": 1,
+      "output_file_grp_cardinality": 1,
       "parameters": {
         "checkpoint_dir": {
           "description": "The directory containing calamari model files (*.ckpt.json). Uses all checkpoints in that directory",
diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 35b5efb..3f3d005 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -1,25 +1,20 @@
 from __future__ import absolute_import
 
+from typing import Optional
 import itertools
-import os
 from glob import glob
 
 import numpy as np
-from ocrd import Processor
-from ocrd_modelfactory import page_from_file
+from ocrd import Processor, OcrdPage, OcrdPageResult
 from ocrd_models.ocrd_page import (
     CoordsType,
     GlyphType,
     TextEquivType,
     WordType,
-    to_xml,
 )
 from ocrd_utils import (
-    MIMETYPE_PAGE,
-    assert_file_grp_cardinality,
+    VERSION as OCRD_VERSION,
     coordinates_for_segment,
-    getLogger,
-    make_file_id,
     points_from_polygon,
     polygon_from_x0y0x1y1,
     tf_disable_interactive_logs,
@@ -39,10 +34,6 @@
 
 # ruff: isort: on
 
-from ocrd_calamari.config import OCRD_TOOL
-
-TOOL = "ocrd-calamari-recognize"
-
 BATCH_SIZE = 64
 if not hasattr(itertools, 'batched'):
     def batched(iterable, n):
@@ -55,17 +46,14 @@ def batched(iterable, n):
     itertools.batched = batched
 
 class CalamariRecognize(Processor):
-    def __init__(self, *args, **kwargs):
-        kwargs["ocrd_tool"] = OCRD_TOOL["tools"][TOOL]
-        kwargs["version"] = "%s (calamari %s, tensorflow %s)" % (
-            OCRD_TOOL["version"],
-            calamari_version,
-            tensorflow_version,
-        )
-        super(CalamariRecognize, self).__init__(*args, **kwargs)
-        if hasattr(self, "output_file_grp"):
-            # processing context
-            self.setup()
+    # max_workers = 1
+
+    @property
+    def executable(self):
+        return 'ocrd-calamari-recognize'
+
+    def show_version(self):
+        print(f"Version {self.version}, calamari {calamari_version}, tensorflow {tensorflow_version}, ocrd/core {OCRD_VERSION}")
 
     def setup(self):
         """
@@ -95,287 +83,265 @@ def setup(self):
         voter_params.type = VoterParams.Type.Value(self.parameter["voter"].upper())
         self.voter = voter_from_proto(voter_params)
 
-    def process(self):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
-        Perform text recognition with Calamari on the workspace.
+        Perform text recognition with Calamari.
 
         If ``texequiv_level`` is ``word`` or ``glyph``, then additionally create word /
         glyph level segments by splitting at white space characters / glyph boundaries.
         In the case of ``glyph``, add all alternative character hypotheses down to
         ``glyph_conf_cutoff`` confidence threshold.
         """
-        log = getLogger("processor.CalamariRecognize")
-
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)
-
-        for n, input_file in enumerate(self.input_files):
-            page_id = input_file.pageId or input_file.ID
-            log.info("INPUT FILE %i / %s", n, page_id)
-            pcgts = page_from_file(self.workspace.download_file(input_file))
+        pcgts = input_pcgts[0]
+        page = pcgts.get_Page()
+        page_image, page_coords, page_image_info = self.workspace.image_from_page(
+            page, page_id, feature_selector=self.features
+        )
 
-            page = pcgts.get_Page()
-            page_image, page_coords, page_image_info = self.workspace.image_from_page(
-                page, page_id, feature_selector=self.features
+        for region in page.get_AllRegions(classes=["Text"]):
+            region_image, region_coords = self.workspace.image_from_segment(
+                region, page_image, page_coords, feature_selector=self.features
             )
 
-            for region in page.get_AllRegions(classes=["Text"]):
-                region_image, region_coords = self.workspace.image_from_segment(
-                    region, page_image, page_coords, feature_selector=self.features
+            textlines = region.get_TextLine()
+            self.logger.info(
+                "About to recognize %i lines of region '%s'",
+                len(textlines),
+                region.id,
+            )
+            line_images_np = []
+            line_coordss = []
+            for line in textlines:
+                self.logger.debug(
+                    "Recognizing line '%s' in region '%s'", line.id, region.id
                 )
 
-                textlines = region.get_TextLine()
-                log.info(
-                    "About to recognize %i lines of region '%s'",
-                    len(textlines),
-                    region.id,
+                line_image, line_coords = self.workspace.image_from_segment(
+                    line,
+                    region_image,
+                    region_coords,
+                    feature_selector=self.features,
                 )
-                line_images_np = []
-                line_coordss = []
-                for line in textlines:
-                    log.debug(
-                        "Recognizing line '%s' in region '%s'", line.id, region.id
+                if (
+                    "binarized" not in line_coords["features"]
+                    and "grayscale_normalized" not in line_coords["features"]
+                    and self.network_input_channels == 1
+                ):
+                    # We cannot use a feature selector for this since we don't
+                    # know whether the model expects (has been trained on)
+                    # binarized or grayscale images; but raw images are likely
+                    # always inadequate:
+                    self.logger.warning(
+                        "Using raw image for line '%s' in region '%s'",
+                        line.id,
+                        region.id,
                     )
 
-                    line_image, line_coords = self.workspace.image_from_segment(
-                        line,
-                        region_image,
-                        region_coords,
-                        feature_selector=self.features,
+                if (
+                    not all(line_image.size)
+                    or line_image.height <= 8
+                    or line_image.width <= 8
+                    or "binarized" in line_coords["features"]
+                    and line_image.convert("1").getextrema()[0] == 255
+                ):
+                    # empty size or too tiny or no foreground at all: skip
+                    self.logger.warning(
+                        "Skipping empty line '%s' in region '%s'",
+                        line.id,
+                        region.id,
                     )
-                    if (
-                        "binarized" not in line_coords["features"]
-                        and "grayscale_normalized" not in line_coords["features"]
-                        and self.network_input_channels == 1
-                    ):
-                        # We cannot use a feature selector for this since we don't
-                        # know whether the model expects (has been trained on)
-                        # binarized or grayscale images; but raw images are likely
-                        # always inadequate:
-                        log.warning(
-                            "Using raw image for line '%s' in region '%s'",
-                            line.id,
-                            region.id,
+                    line_image_np = np.array([[0]], dtype=np.uint8)
+                else:
+                    line_image_np = np.array(line_image, dtype=np.uint8)
+                line_images_np.append(line_image_np)
+                line_coordss.append(line_coords)
+
+            # avoid too large a batch size (causing OOM on CPU or GPU)
+            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+            raw_results_all = itertools.chain.from_iterable(
+                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
+
+            for line, line_coords, raw_results in zip(
+                textlines, line_coordss, raw_results_all
+            ):
+                for i, p in enumerate(raw_results):
+                    p.prediction.id = "fold_{}".format(i)
+
+                prediction = self.voter.vote_prediction_result(raw_results)
+                prediction.id = "voted"
+
+                # Build line text on our own
+                #
+                # Calamari does whitespace post-processing on prediction.sentence,
+                # while it does not do the same on prediction.positions. Do it on
+                # our own to have consistency.
+                #
+                # XXX Check Calamari's built-in post-processing on
+                #     prediction.sentence
+
+                def _sort_chars(p):
+                    """Filter and sort chars of prediction p"""
+                    chars = p.chars
+                    chars = [
+                        c for c in chars if c.char
+                    ]  # XXX Note that omission probabilities are not normalized?!
+                    chars = [
+                        c
+                        for c in chars
+                        if c.probability >= self.parameter["glyph_conf_cutoff"]
+                    ]
+                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                    return chars
+
+                def _drop_leading_spaces(positions):
+                    return list(
+                        itertools.dropwhile(
+                            lambda p: _sort_chars(p)[0].char == " ", positions
                         )
+                    )
 
-                    if (
-                        not all(line_image.size)
-                        or line_image.height <= 8
-                        or line_image.width <= 8
-                        or "binarized" in line_coords["features"]
-                        and line_image.convert("1").getextrema()[0] == 255
-                    ):
-                        # empty size or too tiny or no foreground at all: skip
-                        log.warning(
-                            "Skipping empty line '%s' in region '%s'",
-                            line.id,
-                            region.id,
-                        )
-                        line_image_np = np.array([[0]], dtype=np.uint8)
-                    else:
-                        line_image_np = np.array(line_image, dtype=np.uint8)
-                    line_images_np.append(line_image_np)
-                    line_coordss.append(line_coords)
-
-                # avoid too large a batch size (causing OOM on CPU or GPU)
-                fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
-                raw_results_all = itertools.chain.from_iterable(
-                    map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
-
-                for line, line_coords, raw_results in zip(
-                    textlines, line_coordss, raw_results_all
-                ):
-                    for i, p in enumerate(raw_results):
-                        p.prediction.id = "fold_{}".format(i)
-
-                    prediction = self.voter.vote_prediction_result(raw_results)
-                    prediction.id = "voted"
-
-                    # Build line text on our own
-                    #
-                    # Calamari does whitespace post-processing on prediction.sentence,
-                    # while it does not do the same on prediction.positions. Do it on
-                    # our own to have consistency.
-                    #
-                    # XXX Check Calamari's built-in post-processing on
-                    #     prediction.sentence
-
-                    def _sort_chars(p):
-                        """Filter and sort chars of prediction p"""
-                        chars = p.chars
-                        chars = [
-                            c for c in chars if c.char
-                        ]  # XXX Note that omission probabilities are not normalized?!
-                        chars = [
-                            c
-                            for c in chars
-                            if c.probability >= self.parameter["glyph_conf_cutoff"]
-                        ]
-                        chars = sorted(chars, key=lambda k: k.probability, reverse=True)
-                        return chars
-
-                    def _drop_leading_spaces(positions):
-                        return list(
-                            itertools.dropwhile(
-                                lambda p: _sort_chars(p)[0].char == " ", positions
-                            )
-                        )
+                def _drop_trailing_spaces(positions):
+                    return list(reversed(_drop_leading_spaces(reversed(positions))))
 
-                    def _drop_trailing_spaces(positions):
-                        return list(reversed(_drop_leading_spaces(reversed(positions))))
-
-                    def _drop_double_spaces(positions):
-                        def _drop_double_spaces_generator(positions):
-                            last_was_space = False
-                            for p in positions:
-                                if p.chars[0].char == " ":
-                                    if not last_was_space:
-                                        yield p
-                                    last_was_space = True
-                                else:
+                def _drop_double_spaces(positions):
+                    def _drop_double_spaces_generator(positions):
+                        last_was_space = False
+                        for p in positions:
+                            if p.chars[0].char == " ":
+                                if not last_was_space:
                                     yield p
-                                    last_was_space = False
-
-                        return list(_drop_double_spaces_generator(positions))
-
-                    positions = prediction.positions
-                    positions = _drop_leading_spaces(positions)
-                    positions = _drop_trailing_spaces(positions)
-                    positions = _drop_double_spaces(positions)
-                    positions = list(positions)
-
-                    line_text = "".join(_sort_chars(p)[0].char for p in positions)
-                    if line_text != prediction.sentence:
-                        log.warning(
-                            f"Our own line text is not the same as Calamari's:"
-                            f"'{line_text}' != '{prediction.sentence}'"
-                        )
-
-                    # Delete existing results
-                    if line.get_TextEquiv():
-                        log.warning("Line '%s' already contained text results", line.id)
-                    line.set_TextEquiv([])
-                    if line.get_Word():
-                        log.warning(
-                            "Line '%s' already contained word segmentation", line.id
-                        )
-                    line.set_Word([])
+                                last_was_space = True
+                            else:
+                                yield p
+                                last_was_space = False
+
+                    return list(_drop_double_spaces_generator(positions))
+
+                positions = prediction.positions
+                positions = _drop_leading_spaces(positions)
+                positions = _drop_trailing_spaces(positions)
+                positions = _drop_double_spaces(positions)
+                positions = list(positions)
+
+                line_text = "".join(_sort_chars(p)[0].char for p in positions)
+                if line_text != prediction.sentence:
+                    self.logger.warning(
+                        f"Our own line text is not the same as Calamari's:"
+                        f"'{line_text}' != '{prediction.sentence}'"
+                    )
 
-                    # Save line results
-                    line_conf = prediction.avg_char_probability
-                    line.set_TextEquiv(
-                        [TextEquivType(Unicode=line_text, conf=line_conf)]
+                # Delete existing results
+                if line.get_TextEquiv():
+                    self.logger.warning("Line '%s' already contained text results", line.id)
+                line.set_TextEquiv([])
+                if line.get_Word():
+                    self.logger.warning(
+                        "Line '%s' already contained word segmentation", line.id
                     )
+                line.set_Word([])
 
-                    # Save word results
-                    #
-                    # Calamari OCR does not provide word positions, so we infer word
-                    # positions from a. text segmentation and b. the glyph positions.
-                    # This is necessary because the PAGE XML format enforces a strict
-                    # hierarchy of lines > words > glyphs.
-
-                    def _words(s):
-                        """Split words based on spaces and include spaces as 'words'"""
-                        spaces = None
-                        word = ""
-                        for c in s:
-                            if c == " " and spaces is True:
-                                word += c
-                            elif c != " " and spaces is False:
-                                word += c
-                            else:
-                                if word:
-                                    yield word
-                                word = c
-                                spaces = c == " "
-                        yield word
-
-                    if self.parameter["textequiv_level"] in ["word", "glyph"]:
-                        word_no = 0
-                        i = 0
-
-                        for word_text in _words(line_text):
-                            word_length = len(word_text)
-                            if not all(c == " " for c in word_text):
-                                word_positions = positions[i : i + word_length]
-                                word_start = word_positions[0].global_start
-                                word_end = word_positions[-1].global_end
-
-                                polygon = polygon_from_x0y0x1y1(
-                                    [word_start, 0, word_end, line_image.height]
-                                )
-                                points = points_from_polygon(
-                                    coordinates_for_segment(polygon, None, line_coords)
-                                )
-                                # XXX Crop to line polygon?
+                # Save line results
+                line_conf = prediction.avg_char_probability
+                line.set_TextEquiv(
+                    [TextEquivType(Unicode=line_text, conf=line_conf)]
+                )
 
-                                word = WordType(
-                                    id="%s_word%04d" % (line.id, word_no),
-                                    Coords=CoordsType(points),
-                                )
-                                word.add_TextEquiv(TextEquivType(Unicode=word_text))
-
-                                if self.parameter["textequiv_level"] == "glyph":
-                                    for glyph_no, p in enumerate(word_positions):
-                                        glyph_start = p.global_start
-                                        glyph_end = p.global_end
-
-                                        polygon = polygon_from_x0y0x1y1(
-                                            [
-                                                glyph_start,
-                                                0,
-                                                glyph_end,
-                                                line_image.height,
-                                            ]
-                                        )
-                                        points = points_from_polygon(
-                                            coordinates_for_segment(
-                                                polygon, None, line_coords
-                                            )
-                                        )
+                # Save word results
+                #
+                # Calamari OCR does not provide word positions, so we infer word
+                # positions from a. text segmentation and b. the glyph positions.
+                # This is necessary because the PAGE XML format enforces a strict
+                # hierarchy of lines > words > glyphs.
+
+                def _words(s):
+                    """Split words based on spaces and include spaces as 'words'"""
+                    spaces = None
+                    word = ""
+                    for c in s:
+                        if c == " " and spaces is True:
+                            word += c
+                        elif c != " " and spaces is False:
+                            word += c
+                        else:
+                            if word:
+                                yield word
+                            word = c
+                            spaces = c == " "
+                    yield word
+
+                if self.parameter["textequiv_level"] in ["word", "glyph"]:
+                    word_no = 0
+                    i = 0
+
+                    for word_text in _words(line_text):
+                        word_length = len(word_text)
+                        if not all(c == " " for c in word_text):
+                            word_positions = positions[i : i + word_length]
+                            word_start = word_positions[0].global_start
+                            word_end = word_positions[-1].global_end
+
+                            polygon = polygon_from_x0y0x1y1(
+                                [word_start, 0, word_end, line_image.height]
+                            )
+                            points = points_from_polygon(
+                                coordinates_for_segment(polygon, None, line_coords)
+                            )
+                            # XXX Crop to line polygon?
 
-                                        glyph = GlyphType(
-                                            id="%s_glyph%04d" % (word.id, glyph_no),
-                                            Coords=CoordsType(points),
+                            word = WordType(
+                                id="%s_word%04d" % (line.id, word_no),
+                                Coords=CoordsType(points),
+                            )
+                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                            if self.parameter["textequiv_level"] == "glyph":
+                                for glyph_no, p in enumerate(word_positions):
+                                    glyph_start = p.global_start
+                                    glyph_end = p.global_end
+
+                                    polygon = polygon_from_x0y0x1y1(
+                                        [
+                                            glyph_start,
+                                            0,
+                                            glyph_end,
+                                            line_image.height,
+                                        ]
+                                    )
+                                    points = points_from_polygon(
+                                        coordinates_for_segment(
+                                            polygon, None, line_coords
                                         )
-
-                                        # Add predictions (= TextEquivs)
-                                        char_index_start = 1
-                                        # Index must start with 1, see
-                                        # https://ocr-d.github.io/page#multiple-textequivs
-                                        for char_index, char in enumerate(
-                                            _sort_chars(p), start=char_index_start
-                                        ):
-                                            glyph.add_TextEquiv(
-                                                TextEquivType(
-                                                    Unicode=char.char,
-                                                    index=char_index,
-                                                    conf=char.probability,
-                                                )
+                                    )
+
+                                    glyph = GlyphType(
+                                        id="%s_glyph%04d" % (word.id, glyph_no),
+                                        Coords=CoordsType(points),
+                                    )
+
+                                    # Add predictions (= TextEquivs)
+                                    char_index_start = 1
+                                    # Index must start with 1, see
+                                    # https://ocr-d.github.io/page#multiple-textequivs
+                                    for char_index, char in enumerate(
+                                        _sort_chars(p), start=char_index_start
+                                    ):
+                                        glyph.add_TextEquiv(
+                                            TextEquivType(
+                                                Unicode=char.char,
+                                                index=char_index,
+                                                conf=char.probability,
                                             )
+                                        )
 
-                                        word.add_Glyph(glyph)
-
-                                line.add_Word(word)
-                                word_no += 1
-
-                            i += word_length
+                                    word.add_Glyph(glyph)
 
-            _page_update_higher_textequiv_levels("line", pcgts)
+                            line.add_Word(word)
+                            word_no += 1
 
-            # Add metadata about this operation and its runtime parameters:
-            self.add_metadata(pcgts)
-            file_id = make_file_id(input_file, self.output_file_grp)
-            pcgts.set_pcGtsId(file_id)
-            self.workspace.add_file(
-                file_id=file_id,
-                file_grp=self.output_file_grp,
-                page_id=input_file.pageId,
-                mimetype=MIMETYPE_PAGE,
-                local_filename=os.path.join(self.output_file_grp, file_id + ".xml"),
-                content=to_xml(pcgts),
-            )
+                        i += word_length
 
+        _page_update_higher_textequiv_levels("line", pcgts)
+        return OcrdPageResult(pcgts)
 
 # TODO: This is a copy of ocrd_tesserocr's function, and should probably be moved to a
 #       ocrd lib

From 1edd5e70cee7b701ff732990e25e723800ca402a Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Sun, 15 Sep 2024 15:58:32 +0200
Subject: [PATCH 03/10] make test: no assumption on OCRD resource location

---
 Makefile                      | 2 +-
 test/{base.py => conftest.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename test/{base.py => conftest.py} (100%)

diff --git a/Makefile b/Makefile
index f3164dc..42cf13e 100644
--- a/Makefile
+++ b/Makefile
@@ -41,7 +41,7 @@ install:
 $(MODEL):
 	ocrd resmgr download ocrd-calamari-recognize $@
 	# Workaround, see #91 https://github.com/OCR-D/ocrd_calamari/issues/91
-	fix-calamari1-model ~/.local/share/ocrd-resources/ocrd-calamari-recognize/$@
+	fix-calamari1-model `ocrd-calamari-recognize --resolve-resource $@`
 
 # Download example data (for the README)
 example: $(EXAMPLE)
diff --git a/test/base.py b/test/conftest.py
similarity index 100%
rename from test/base.py
rename to test/conftest.py

From 3333cab523d58244ea46f1da3f5d7aeaf1379d44 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Sun, 15 Sep 2024 15:58:33 +0200
Subject: [PATCH 04/10] tests: adapt to v3, overhaul and add caching+threading
 modes

---
 Makefile               |   2 +-
 test/conftest.py       |  65 +++++++++++-
 test/test_recognize.py | 232 ++++++++++++++++-------------------------
 3 files changed, 151 insertions(+), 148 deletions(-)

diff --git a/Makefile b/Makefile
index 42cf13e..eae9e2b 100644
--- a/Makefile
+++ b/Makefile
@@ -84,7 +84,7 @@ assets-clean:
 # Run unit tests
 test: test/assets $(MODEL)
 	# declare -p HTTP_PROXY
-	$(PYTHON) -m pytest --continue-on-collection-errors test $(PYTEST_ARGS)
+	$(PYTHON) -m pytest --continue-on-collection-errors --durations=0 test $(PYTEST_ARGS)
 
 # Run unit tests and determine test coverage
 coverage: test/assets $(MODEL)
diff --git a/test/conftest.py b/test/conftest.py
index d2dc025..2403cc7 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -1,7 +1,64 @@
-from test.assets import assets
+from multiprocessing import Process
+from time import sleep
+import pytest
 
-from ocrd_utils import initLogging
+from ocrd import Resolver, Workspace, OcrdMetsServer
+from ocrd_utils import pushd_popd, disableLogging, initLogging, setOverrideLogLevel, config
 
-initLogging()
+from .assets import assets
 
-__all__ = ["assets"]
+CONFIGS = ['', 'pageparallel', 'metscache', 'pageparallel+metscache']
+
+@pytest.fixture(params=CONFIGS)
+def workspace(tmpdir, pytestconfig, request):
+    def _make_workspace(workspace_path):
+        initLogging()
+        if pytestconfig.getoption('verbose') > 0:
+            setOverrideLogLevel('DEBUG')
+        with pushd_popd(tmpdir):
+            directory = str(tmpdir)
+            resolver = Resolver()
+            workspace = resolver.workspace_from_url(workspace_path, dst_dir=directory, download=True)
+            config.OCRD_MISSING_OUTPUT = "ABORT"
+            if 'metscache' in request.param:
+                config.OCRD_METS_CACHING = True
+                print("enabled METS caching")
+            if 'pageparallel' in request.param:
+                config.OCRD_MAX_PARALLEL_PAGES = 4
+                print("enabled page-parallel processing")
+                def _start_mets_server(*args, **kwargs):
+                    print("running with METS server")
+                    server = OcrdMetsServer(*args, **kwargs)
+                    server.startup()
+                process = Process(target=_start_mets_server,
+                                  kwargs={'workspace': workspace, 'url': 'mets.sock'})
+                process.start()
+                sleep(1)
+                workspace = Workspace(resolver, directory, mets_server_url='mets.sock')
+                yield {'workspace': workspace, 'mets_server_url': 'mets.sock'}
+                process.terminate()
+            else:
+                yield {'workspace': workspace}
+        config.reset_defaults()
+    return _make_workspace
+
+
+@pytest.fixture
+def workspace_manifesto(workspace):
+    yield from workspace(assets.path_to('communist_manifesto/data/mets.xml'))
+
+@pytest.fixture
+def workspace_aufklaerung(workspace):
+    yield from workspace(assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
+
+@pytest.fixture
+def workspace_aufklaerung_binarized(workspace):
+    yield from workspace(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml'))
+
+@pytest.fixture
+def workspace_aufklaerung_glyph(workspace):
+    yield from workspace(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml'))
+
+@pytest.fixture
+def workspace_sbb(workspace):
+    yield from workspace(assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'))
diff --git a/test/test_recognize.py b/test/test_recognize.py
index f4e3587..3419214 100644
--- a/test/test_recognize.py
+++ b/test/test_recognize.py
@@ -2,197 +2,143 @@
 import os
 import shutil
 import subprocess
-import tempfile
 
-import pytest
 from lxml import etree
-from ocrd.resolver import Resolver
 
+from ocrd import run_processor
+from ocrd_utils import MIMETYPE_PAGE as PAGE
+from ocrd_models.constants import NAMESPACES as NS
+from ocrd_modelfactory import page_from_file
 from ocrd_calamari import CalamariRecognize
 
-from .base import assets
-
-METS_KANT = assets.url_of(
-    "kant_aufklaerung_1784-page-region-line-word_glyph/data/mets.xml"
-)
-WORKSPACE_DIR = tempfile.mkdtemp(prefix="test-ocrd-calamari-")
 CHECKPOINT_DIR = os.getenv("MODEL", "qurator-gt4histocr-1.0")
 DEBUG = os.getenv("DEBUG", False)
 
 
-def page_namespace(tree):
-    """Return the PAGE content namespace used in the given ElementTree.
-
-    This relies on the assumption that, in any given PAGE content file, the root element
-    has the local name "PcGts". We do not check if the files uses any valid PAGE
-    namespace.
-    """
-    root_name = etree.QName(tree.getroot().tag)
-    if root_name.localname == "PcGts":
-        return root_name.namespace
-    else:
-        raise ValueError("Not a PAGE tree")
-
-
-def assertFileContains(fn, text):
+def assertFileContains(fn, text, msg=""):
     """Assert that the given file contains a given string."""
     with open(fn, "r", encoding="utf-8") as f:
-        assert text in f.read()
+        assert text in f.read(), msg
 
 
-def assertFileDoesNotContain(fn, text):
+def assertFileDoesNotContain(fn, text, msg=""):
     """Assert that the given file does not contain given string."""
     with open(fn, "r", encoding="utf-8") as f:
-        assert text not in f.read()
-
-
-@pytest.fixture
-def workspace():
-    if os.path.exists(WORKSPACE_DIR):
-        shutil.rmtree(WORKSPACE_DIR)
-    os.makedirs(WORKSPACE_DIR)
-
-    resolver = Resolver()
-    # due to core#809 this does not always work:
-    # workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
-    # workaround:
-    shutil.rmtree(WORKSPACE_DIR)
-    shutil.copytree(os.path.dirname(METS_KANT), WORKSPACE_DIR)
-    workspace = resolver.workspace_from_url(os.path.join(WORKSPACE_DIR, "mets.xml"))
-
-    # The binarization options I have are:
-    #
-    # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my
-    #    machine (protobuf)
-    # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency
-    #    olena doesn't compile on my machine
-    # c. just fumble with the original files
-    #
-    # So I'm going for option c.
-    for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
-        imgf = workspace.download_file(imgf)
-        path = os.path.join(workspace.directory, imgf.local_filename)
-        subprocess.call(["mogrify", "-threshold", "50%", path])
-
-    # Remove GT Words and TextEquivs, to not accidently check GT text instead of the
-    # OCR text
-    # XXX Review data again
-    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
-        workspace.download_file(of)
-        path = os.path.join(workspace.directory, of.local_filename)
-        tree = etree.parse(path)
-        nsmap_gt = {"pc": page_namespace(tree)}
-        for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
-            for e in tree.xpath(to_remove, namespaces=nsmap_gt):
-                e.getparent().remove(e)
-        tree.write(path, xml_declaration=True, encoding="utf-8")
-        assertFileDoesNotContain(path, "TextEquiv")
-
-    yield workspace
-
-    if not DEBUG:
-        shutil.rmtree(WORKSPACE_DIR)
-
-
-def test_recognize(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
+        assert text not in f.read(), msg
+
+
+
+def test_recognize(workspace_aufklaerung_binarized, caplog):
+    caplog.set_level(logging.WARNING)
+    ws = workspace_aufklaerung_binarized['workspace']
+    page1 = ws.mets.physical_pages[0]
+    file1 = list(ws.find_files(file_grp="OCR-D-GT-WORD", page_id=page1, mimetype=PAGE))[0]
+    text1 = page_from_file(file1).etree.xpath(
+        '//page:TextLine/page:TextEquiv[1]/page:Unicode/text()', namespaces=NS)
+    assert len(text1) > 10
+    assert "verſchuldeten" in "\n".join(text1)
+    run_processor(
+        CalamariRecognize,
+        input_file_grp="OCR-D-GT-WORD",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
         },
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(
-        workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml"
+        **workspace_aufklaerung_binarized,
     )
-    assert os.path.exists(page1)
-    assertFileContains(page1, "verſchuldeten")
+    overwrite_text_log_messages = [t[2] for t in caplog.record_tuples
+                                   if "already contained text results" in t[2]]
+    assert len(overwrite_text_log_messages) > 10  # For every line!
+    overwrite_word_log_messages = [t[2] for t in caplog.record_tuples
+                                   if "already contained word segmentation" in t[2]]
+    assert len(overwrite_word_log_messages) > 10  # For every line!
+    ws.save_mets()
+    file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False)
+    assert file1, "result for first page not referenced in METS"
+    assert os.path.exists(file1.local_filename), "result for first page not found in filesystem"
+    text1_out = page_from_file(file1).etree.xpath(
+        '//page:TextLine/page:TextEquiv[1]/page:Unicode/text()', namespaces=NS)
+    assert len(text1_out) == len(text1), "not all lines have been recognized"
+    assert "verſchuldeten" in "\n".join(text1_out), "result for first page is inaccurate"
+    assert "\n".join(text1_out) != "\n".join(text1), "result is suspiciously identical to GT"
 
 
 def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(
-    workspace, caplog
+    workspace_aufklaerung, caplog
 ):
     caplog.set_level(logging.WARNING)
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
-        output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
+    run_processor(
+        CalamariRecognize,
+        input_file_grp="OCR-D-GT-PAGE",
+        output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={"checkpoint_dir": CHECKPOINT_DIR},
-    ).process()
-
-    interesting_log_messages = [
-        t[2] for t in caplog.record_tuples if "Using raw image" in t[2]
-    ]
+        **workspace_aufklaerung,
+    )
+    interesting_log_messages = [t[2] for t in caplog.record_tuples
+                                if "Using raw image" in t[2]]
     assert len(interesting_log_messages) > 10  # For every line!
 
 
-def test_word_segmentation(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
+def test_word_segmentation(workspace_aufklaerung_binarized):
+    run_processor(
+        CalamariRecognize,
+        input_file_grp="OCR-D-GT-WORD",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
-            "textequiv_level": "word",  # Note that we're going down to word level here
+            "textequiv_level": "word",
         },
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(
-        workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml"
+        **workspace_aufklaerung_binarized
     )
-    assert os.path.exists(page1)
-    tree = etree.parse(page1)
-    nsmap = {"pc": page_namespace(tree)}
-
-    # The result should contain a TextLine that contains the text "December"
-    line = tree.xpath(
-        ".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]",
-        namespaces=nsmap,
-    )[0]
-    assert line is not None
-
+    ws = workspace_aufklaerung_binarized['workspace']
+    ws.save_mets()
+    page1 = ws.mets.physical_pages[0]
+    file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False)
+    assert file1, "result for first page not referenced in METS"
+    assert os.path.exists(file1.local_filename), "result for first page not found in filesystem"
+    tree1 = page_from_file(file1).etree
+    # The result should contain a TextLine that contains the text "Berliniſche"
+    line = tree1.xpath(
+        "//page:TextLine[page:TextEquiv/page:Unicode[contains(text(),'Berliniſche')]]",
+        namespaces=NS,
+    )
+    assert len(line) == 1, "result is inaccurate"
+    line = line[0]
     # The textline should
     # a. contain multiple words and
     # b. these should concatenate fine to produce the same line text
-    words = line.xpath(".//pc:Word", namespaces=nsmap)
-    assert len(words) >= 2
+    words = line.xpath(".//page:Word", namespaces=NS)
+    assert len(words) >= 2, "result does not contain words"
     words_text = " ".join(
-        word.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text
+        word.xpath("page:TextEquiv[1]/page:Unicode/text()", namespaces=NS)[0]
         for word in words
     )
-    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=nsmap)[0].text
-    assert words_text == line_text
-
+    line_text = line.xpath("page:TextEquiv[1]/page:Unicode/text()", namespaces=NS)[0]
+    assert words_text == line_text, "word-level text result does not concatenate to line-level text result"
     # For extra measure, check that we're not seeing any glyphs, as we asked for
     # textequiv_level == "word"
-    glyphs = tree.xpath("//pc:Glyph", namespaces=nsmap)
-    assert len(glyphs) == 0
+    glyphs = tree1.xpath("//page:Glyph", namespaces=NS)
+    assert len(glyphs) == 0, "result must not contain glyph-level segments"
 
 
-def test_glyphs(workspace):
-    CalamariRecognize(
-        workspace,
-        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
+def test_glyphs(workspace_aufklaerung_binarized):
+    run_processor(
+        CalamariRecognize,
+        input_file_grp="OCR-D-GT-WORD",
         output_file_grp="OCR-D-OCR-CALAMARI",
         parameter={
             "checkpoint_dir": CHECKPOINT_DIR,
-            # Note that we're going down to glyph level here
             "textequiv_level": "glyph",
         },
-    ).process()
-    workspace.save_mets()
-
-    page1 = os.path.join(
-        workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_phys_0001.xml"
+        **workspace_aufklaerung_binarized,
     )
-    assert os.path.exists(page1)
-    tree = etree.parse(page1)
-    nsmap = {"pc": page_namespace(tree)}
-
+    ws = workspace_aufklaerung_binarized['workspace']
+    ws.save_mets()
+    page1 = ws.mets.physical_pages[0]
+    file1 = next(ws.find_files(file_grp="OCR-D-OCR-CALAMARI", page_id=page1, mimetype=PAGE), False)
+    assert file1, "result for first page not referenced in METS"
+    assert os.path.exists(file1.local_filename), "result for first page not found in filesystem"
+    tree1 = page_from_file(file1).etree
     # The result should contain a lot of glyphs
-    glyphs = tree.xpath("//pc:Glyph", namespaces=nsmap)
-    assert len(glyphs) >= 100
+    glyphs = tree1.xpath("//page:Glyph", namespaces=NS)
+    assert len(glyphs) >= 100, "result must contain lots of glyphs"

From 7aae9bca8a87db994045107e8092cf25395612a9 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Mon, 16 Sep 2024 13:40:53 +0200
Subject: [PATCH 05/10] require ocrd 3.0 and calamari-ocr 1.0.7

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5eebd46..b637015 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 tensorflow >= 2.5.0, < 2.16
 numpy
-calamari-ocr == 1.0.*, >= 1.0.6
+calamari-ocr == 1.0.*, >= 1.0.7
 setuptools >= 41.0.0  # tensorboard depends on this, but why do we get an error at runtime?
 click
-ocrd >= 2.54.0
+ocrd >= 3.0.0b5

From 9611e2c1ffb27d148b6253719dfb5a686b15a55e Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Tue, 17 Sep 2024 18:13:28 +0000
Subject: [PATCH 06/10] aggregate all lines instead of per region to better
 utilise batched predictor

---
 ocrd_calamari/recognize.py | 356 ++++++++++++++++++-------------------
 1 file changed, 178 insertions(+), 178 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 3f3d005..c3fbd04 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -61,7 +61,7 @@ def setup(self):
         """
         resolved = self.resolve_resource(self.parameter["checkpoint_dir"])
         checkpoints = glob("%s/*.ckpt.json" % resolved)
-        self.predictor = MultiPredictor(checkpoints=checkpoints)
+        self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE)
 
         self.network_input_channels = self.predictor.predictors[
             0
@@ -98,6 +98,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
             page, page_id, feature_selector=self.features
         )
 
+        lines = []
         for region in page.get_AllRegions(classes=["Text"]):
             region_image, region_coords = self.workspace.image_from_segment(
                 region, page_image, page_coords, feature_selector=self.features
@@ -109,8 +110,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
                 len(textlines),
                 region.id,
             )
-            line_images_np = []
-            line_coordss = []
             for line in textlines:
                 self.logger.debug(
                     "Recognizing line '%s' in region '%s'", line.id, region.id
@@ -150,195 +149,196 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
                         line.id,
                         region.id,
                     )
-                    line_image_np = np.array([[0]], dtype=np.uint8)
-                else:
-                    line_image_np = np.array(line_image, dtype=np.uint8)
-                line_images_np.append(line_image_np)
-                line_coordss.append(line_coords)
-
-            # avoid too large a batch size (causing OOM on CPU or GPU)
-            fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
-            raw_results_all = itertools.chain.from_iterable(
-                map(fun, itertools.batched(line_images_np, BATCH_SIZE)))
-
-            for line, line_coords, raw_results in zip(
-                textlines, line_coordss, raw_results_all
-            ):
-                for i, p in enumerate(raw_results):
-                    p.prediction.id = "fold_{}".format(i)
-
-                prediction = self.voter.vote_prediction_result(raw_results)
-                prediction.id = "voted"
-
-                # Build line text on our own
-                #
-                # Calamari does whitespace post-processing on prediction.sentence,
-                # while it does not do the same on prediction.positions. Do it on
-                # our own to have consistency.
-                #
-                # XXX Check Calamari's built-in post-processing on
-                #     prediction.sentence
-
-                def _sort_chars(p):
-                    """Filter and sort chars of prediction p"""
-                    chars = p.chars
-                    chars = [
-                        c for c in chars if c.char
-                    ]  # XXX Note that omission probabilities are not normalized?!
-                    chars = [
-                        c
-                        for c in chars
-                        if c.probability >= self.parameter["glyph_conf_cutoff"]
-                    ]
-                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
-                    return chars
-
-                def _drop_leading_spaces(positions):
-                    return list(
-                        itertools.dropwhile(
-                            lambda p: _sort_chars(p)[0].char == " ", positions
-                        )
+                    continue
+                lines.append((line, line_coords, np.array(line_image, dtype=np.uint8)))
+
+        if not len(lines):
+            self.logger.warning("No text lines on page '%s'", page_id)
+            return OcrdPageResult(pcgts)
+
+        lines, coords, images = zip(*lines)
+        # not exposed in MultiPredictor yet, cf. calamari#361:
+        # results = self.predictor.predict_raw(images, progress_bar=False, batch_size=BATCH_SIZE)
+        # avoid too large a batch size (causing OOM on CPU or GPU)
+        fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+        results = itertools.chain.from_iterable(
+            map(fun, itertools.batched(images, BATCH_SIZE)))
+        for line, line_coords, raw_results in zip(lines, coords, results):
+            for i, p in enumerate(raw_results):
+                p.prediction.id = "fold_{}".format(i)
+
+            prediction = self.voter.vote_prediction_result(raw_results)
+            prediction.id = "voted"
+
+            # Build line text on our own
+            #
+            # Calamari does whitespace post-processing on prediction.sentence,
+            # while it does not do the same on prediction.positions. Do it on
+            # our own to have consistency.
+            #
+            # XXX Check Calamari's built-in post-processing on
+            #     prediction.sentence
+
+            def _sort_chars(p):
+                """Filter and sort chars of prediction p"""
+                chars = p.chars
+                chars = [
+                    c for c in chars if c.char
+                ]  # XXX Note that omission probabilities are not normalized?!
+                chars = [
+                    c
+                    for c in chars
+                    if c.probability >= self.parameter["glyph_conf_cutoff"]
+                ]
+                chars = sorted(chars, key=lambda k: k.probability, reverse=True)
+                return chars
+
+            def _drop_leading_spaces(positions):
+                return list(
+                    itertools.dropwhile(
+                        lambda p: _sort_chars(p)[0].char == " ", positions
                     )
+                )
 
-                def _drop_trailing_spaces(positions):
-                    return list(reversed(_drop_leading_spaces(reversed(positions))))
-
-                def _drop_double_spaces(positions):
-                    def _drop_double_spaces_generator(positions):
-                        last_was_space = False
-                        for p in positions:
-                            if p.chars[0].char == " ":
-                                if not last_was_space:
-                                    yield p
-                                last_was_space = True
-                            else:
-                                yield p
-                                last_was_space = False
-
-                    return list(_drop_double_spaces_generator(positions))
-
-                positions = prediction.positions
-                positions = _drop_leading_spaces(positions)
-                positions = _drop_trailing_spaces(positions)
-                positions = _drop_double_spaces(positions)
-                positions = list(positions)
-
-                line_text = "".join(_sort_chars(p)[0].char for p in positions)
-                if line_text != prediction.sentence:
-                    self.logger.warning(
-                        f"Our own line text is not the same as Calamari's:"
-                        f"'{line_text}' != '{prediction.sentence}'"
-                    )
+            def _drop_trailing_spaces(positions):
+                return list(reversed(_drop_leading_spaces(reversed(positions))))
 
-                # Delete existing results
-                if line.get_TextEquiv():
-                    self.logger.warning("Line '%s' already contained text results", line.id)
-                line.set_TextEquiv([])
-                if line.get_Word():
-                    self.logger.warning(
-                        "Line '%s' already contained word segmentation", line.id
-                    )
-                line.set_Word([])
+            def _drop_double_spaces(positions):
+                def _drop_double_spaces_generator(positions):
+                    last_was_space = False
+                    for p in positions:
+                        if p.chars[0].char == " ":
+                            if not last_was_space:
+                                yield p
+                            last_was_space = True
+                        else:
+                            yield p
+                            last_was_space = False
+
+                return list(_drop_double_spaces_generator(positions))
+
+            positions = prediction.positions
+            positions = _drop_leading_spaces(positions)
+            positions = _drop_trailing_spaces(positions)
+            positions = _drop_double_spaces(positions)
+            positions = list(positions)
+
+            line_text = "".join(_sort_chars(p)[0].char for p in positions)
+            if line_text != prediction.sentence:
+                self.logger.warning(
+                    f"Our own line text is not the same as Calamari's:"
+                    f"'{line_text}' != '{prediction.sentence}'"
+                )
 
-                # Save line results
-                line_conf = prediction.avg_char_probability
-                line.set_TextEquiv(
-                    [TextEquivType(Unicode=line_text, conf=line_conf)]
+            # Delete existing results
+            if line.get_TextEquiv():
+                self.logger.warning("Line '%s' already contained text results", line.id)
+            line.set_TextEquiv([])
+            if line.get_Word():
+                self.logger.warning(
+                    "Line '%s' already contained word segmentation", line.id
                 )
+            line.set_Word([])
 
-                # Save word results
-                #
-                # Calamari OCR does not provide word positions, so we infer word
-                # positions from a. text segmentation and b. the glyph positions.
-                # This is necessary because the PAGE XML format enforces a strict
-                # hierarchy of lines > words > glyphs.
-
-                def _words(s):
-                    """Split words based on spaces and include spaces as 'words'"""
-                    spaces = None
-                    word = ""
-                    for c in s:
-                        if c == " " and spaces is True:
-                            word += c
-                        elif c != " " and spaces is False:
-                            word += c
-                        else:
-                            if word:
-                                yield word
-                            word = c
-                            spaces = c == " "
-                    yield word
-
-                if self.parameter["textequiv_level"] in ["word", "glyph"]:
-                    word_no = 0
-                    i = 0
-
-                    for word_text in _words(line_text):
-                        word_length = len(word_text)
-                        if not all(c == " " for c in word_text):
-                            word_positions = positions[i : i + word_length]
-                            word_start = word_positions[0].global_start
-                            word_end = word_positions[-1].global_end
-
-                            polygon = polygon_from_x0y0x1y1(
-                                [word_start, 0, word_end, line_image.height]
-                            )
-                            points = points_from_polygon(
-                                coordinates_for_segment(polygon, None, line_coords)
-                            )
-                            # XXX Crop to line polygon?
+            # Save line results
+            line_conf = prediction.avg_char_probability
+            line.set_TextEquiv(
+                [TextEquivType(Unicode=line_text, conf=line_conf)]
+            )
 
-                            word = WordType(
-                                id="%s_word%04d" % (line.id, word_no),
-                                Coords=CoordsType(points),
-                            )
-                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
-
-                            if self.parameter["textequiv_level"] == "glyph":
-                                for glyph_no, p in enumerate(word_positions):
-                                    glyph_start = p.global_start
-                                    glyph_end = p.global_end
-
-                                    polygon = polygon_from_x0y0x1y1(
-                                        [
-                                            glyph_start,
-                                            0,
-                                            glyph_end,
-                                            line_image.height,
-                                        ]
-                                    )
-                                    points = points_from_polygon(
-                                        coordinates_for_segment(
-                                            polygon, None, line_coords
-                                        )
-                                    )
+            # Save word results
+            #
+            # Calamari OCR does not provide word positions, so we infer word
+            # positions from a. text segmentation and b. the glyph positions.
+            # This is necessary because the PAGE XML format enforces a strict
+            # hierarchy of lines > words > glyphs.
+
+            def _words(s):
+                """Split words based on spaces and include spaces as 'words'"""
+                spaces = None
+                word = ""
+                for c in s:
+                    if c == " " and spaces is True:
+                        word += c
+                    elif c != " " and spaces is False:
+                        word += c
+                    else:
+                        if word:
+                            yield word
+                        word = c
+                        spaces = c == " "
+                yield word
+
+            if self.parameter["textequiv_level"] in ["word", "glyph"]:
+                word_no = 0
+                i = 0
+
+                for word_text in _words(line_text):
+                    word_length = len(word_text)
+                    if not all(c == " " for c in word_text):
+                        word_positions = positions[i : i + word_length]
+                        word_start = word_positions[0].global_start
+                        word_end = word_positions[-1].global_end
+
+                        polygon = polygon_from_x0y0x1y1(
+                            [word_start, 0, word_end, line_image.height]
+                        )
+                        points = points_from_polygon(
+                            coordinates_for_segment(polygon, None, line_coords)
+                        )
+                        # XXX Crop to line polygon?
 
-                                    glyph = GlyphType(
-                                        id="%s_glyph%04d" % (word.id, glyph_no),
-                                        Coords=CoordsType(points),
+                        word = WordType(
+                            id="%s_word%04d" % (line.id, word_no),
+                            Coords=CoordsType(points),
+                        )
+                        word.add_TextEquiv(TextEquivType(Unicode=word_text))
+
+                        if self.parameter["textequiv_level"] == "glyph":
+                            for glyph_no, p in enumerate(word_positions):
+                                glyph_start = p.global_start
+                                glyph_end = p.global_end
+
+                                polygon = polygon_from_x0y0x1y1(
+                                    [
+                                        glyph_start,
+                                        0,
+                                        glyph_end,
+                                        line_image.height,
+                                    ]
+                                )
+                                points = points_from_polygon(
+                                    coordinates_for_segment(
+                                        polygon, None, line_coords
                                     )
+                                )
 
-                                    # Add predictions (= TextEquivs)
-                                    char_index_start = 1
-                                    # Index must start with 1, see
-                                    # https://ocr-d.github.io/page#multiple-textequivs
-                                    for char_index, char in enumerate(
-                                        _sort_chars(p), start=char_index_start
-                                    ):
-                                        glyph.add_TextEquiv(
-                                            TextEquivType(
-                                                Unicode=char.char,
-                                                index=char_index,
-                                                conf=char.probability,
-                                            )
+                                glyph = GlyphType(
+                                    id="%s_glyph%04d" % (word.id, glyph_no),
+                                    Coords=CoordsType(points),
+                                )
+
+                                # Add predictions (= TextEquivs)
+                                char_index_start = 1
+                                # Index must start with 1, see
+                                # https://ocr-d.github.io/page#multiple-textequivs
+                                for char_index, char in enumerate(
+                                    _sort_chars(p), start=char_index_start
+                                ):
+                                    glyph.add_TextEquiv(
+                                        TextEquivType(
+                                            Unicode=char.char,
+                                            index=char_index,
+                                            conf=char.probability,
                                         )
+                                    )
 
-                                    word.add_Glyph(glyph)
+                                word.add_Glyph(glyph)
 
-                            line.add_Word(word)
-                            word_no += 1
+                        line.add_Word(word)
+                        word_no += 1
 
-                        i += word_length
+                    i += word_length
 
         _page_update_higher_textequiv_levels("line", pcgts)
         return OcrdPageResult(pcgts)

From fb2a68018990122440cdf300926118d4d6fd3849 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Wed, 18 Sep 2024 12:31:13 +0000
Subject: [PATCH 07/10] run prediction in bg thread (shared across pages to
 interleave CPU/GPU)

---
 ocrd_calamari/recognize.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index c3fbd04..846fccd 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -3,6 +3,7 @@
 from typing import Optional
 import itertools
 from glob import glob
+from concurrent.futures import ThreadPoolExecutor
 
 import numpy as np
 from ocrd import Processor, OcrdPage, OcrdPageResult
@@ -46,8 +47,6 @@ def batched(iterable, n):
     itertools.batched = batched
 
 class CalamariRecognize(Processor):
-    # max_workers = 1
-
     @property
     def executable(self):
         return 'ocrd-calamari-recognize'
@@ -83,6 +82,9 @@ def setup(self):
         voter_params.type = VoterParams.Type.Value(self.parameter["voter"].upper())
         self.voter = voter_from_proto(voter_params)
 
+        # run in a background thread so GPU parts can be interleaved with CPU pre-/post-processing across pages
+        self.executor = ThreadPoolExecutor(max_workers=1)
+
     def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """
         Perform text recognition with Calamari.
@@ -158,9 +160,9 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional
 
         lines, coords, images = zip(*lines)
         # not exposed in MultiPredictor yet, cf. calamari#361:
-        # results = self.predictor.predict_raw(images, progress_bar=False, batch_size=BATCH_SIZE)
+        # results = self.executor.submit(self.predictor.predict_raw, images, progress_bar=False, batch_size=BATCH_SIZE).result()
         # avoid too large a batch size (causing OOM on CPU or GPU)
-        fun = lambda x: self.predictor.predict_raw(x, progress_bar=False)
+        fun = lambda x: self.executor.submit(self.predictor.predict_raw, x, progress_bar=False).result()
         results = itertools.chain.from_iterable(
             map(fun, itertools.batched(images, BATCH_SIZE)))
         for line, line_coords, raw_results in zip(lines, coords, results):

From b9b0e13501af05ceb0b635ef45301ad6c9177af6 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Wed, 18 Sep 2024 12:31:58 +0000
Subject: [PATCH 08/10] let GPU memory grow on demand (instead of exclusive
 reservation)

---
 ocrd_calamari/recognize.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ocrd_calamari/recognize.py b/ocrd_calamari/recognize.py
index 846fccd..02d3702 100644
--- a/ocrd_calamari/recognize.py
+++ b/ocrd_calamari/recognize.py
@@ -32,6 +32,7 @@
 from calamari_ocr.ocr import MultiPredictor
 from calamari_ocr.ocr.voting import voter_from_proto
 from calamari_ocr.proto import VoterParams
+from tensorflow import config as tensorflow_config
 
 # ruff: isort: on
 
@@ -58,6 +59,10 @@ def setup(self):
         """
         Set up the model prior to processing.
         """
+        devices = tensorflow_config.list_physical_devices("GPU")
+        for device in devices:
+            self.logger.info("using GPU device %s", device)
+            tensorflow_config.experimental.set_memory_growth(device, True)
         resolved = self.resolve_resource(self.parameter["checkpoint_dir"])
         checkpoints = glob("%s/*.ckpt.json" % resolved)
         self.predictor = MultiPredictor(checkpoints=checkpoints, batch_size=BATCH_SIZE)

From 46c2ef6c2587980e3215d1bb690e42b742ac315f Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Wed, 18 Sep 2024 13:49:38 +0000
Subject: [PATCH 09/10] no more need for model fixup

---
 Makefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Makefile b/Makefile
index eae9e2b..e7fbc83 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,6 @@ install:
 
 $(MODEL):
 	ocrd resmgr download ocrd-calamari-recognize $@
-	# Workaround, see #91 https://github.com/OCR-D/ocrd_calamari/issues/91
-	fix-calamari1-model `ocrd-calamari-recognize --resolve-resource $@`
 
 # Download example data (for the README)
 example: $(EXAMPLE)

From e68ce5f96299fc0e9443ece8e3e1e84fb52460c4 Mon Sep 17 00:00:00 2001
From: Robert Sachunsky <sachunsky@informatik.uni-leipzig.de>
Date: Wed, 18 Sep 2024 13:53:18 +0000
Subject: [PATCH 10/10] CI: increase RAM

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index a17560d..43fa5cc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -28,6 +28,7 @@ jobs:
           key: v01-pydeps-<< parameters.python-image >>-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
           paths:
             - "~/.cache/pip"
+    resource_class: large
 
 workflows:
   build: