From ac5e7b5214aefb0968715a7433ece52b749a3ae2 Mon Sep 17 00:00:00 2001
From: Benjamin Kiessling
Date: Thu, 9 May 2024 16:49:35 +0200
Subject: [PATCH] 5.x dataset from object regression and some basic tests so
 this doesn't happen again

---
 kraken/lib/arrow_dataset.py                   |  9 +--
 tests/resources/170025120000003,0074-lite.xml |  2 +-
 tests/test_arrow_dataset.py                   | 72 ++++++++++++++++---
 3 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/kraken/lib/arrow_dataset.py b/kraken/lib/arrow_dataset.py
index 3d4b79a0f..bf3423266 100755
--- a/kraken/lib/arrow_dataset.py
+++ b/kraken/lib/arrow_dataset.py
@@ -52,8 +52,7 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
         return lines, None, None
     if is_bitonal(im):
         im = im.convert('1')
-    recs = xml_record.lines.values()
-    for idx, rec in enumerate(recs):
+    for idx, rec in enumerate(xml_record.lines):
         seg = Segmentation(text_direction='horizontal-lr',
                            imagename=xml_record.imagename,
                            type=xml_record.type,
@@ -167,6 +166,8 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
     for doc in files:
         try:
             data = parse_fn(doc)
+            if format_type in ['xml', 'alto', 'page']:
+                data = data.to_container()
         except (FileNotFoundError, KrakenInputException, ValueError):
             logger.warning(f'Invalid input file {doc}')
             continue
@@ -191,12 +192,12 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
     num_lines = 0
     for doc in docs:
         if format_type in ['xml', 'alto', 'page', None]:
-            lines = doc.lines.values()
+            lines = doc.lines
         elif format_type == 'path':
             lines = doc['lines']
         for line in lines:
             num_lines += 1
-            alphabet.update(line.text if format_type in ['xml', 'alto', 'page'] else line['text'])
+            alphabet.update(line.text if format_type in ['xml', 'alto', 'page', None] else line['text'])
 
     callback(0, num_lines)
 
diff --git a/tests/resources/170025120000003,0074-lite.xml b/tests/resources/170025120000003,0074-lite.xml
index 504794e0f..b87e85d4c 100644
--- a/tests/resources/170025120000003,0074-lite.xml
+++ b/tests/resources/170025120000003,0074-lite.xml
@@ -33,7 +33,7 @@
-            $-nor su hijo, De todos sus bienes, con los pactos
+
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 4ce06031a..31c3fb8ba 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -2,8 +2,10 @@
 import json
 import unittest
-from pathlib import Path
+import tempfile
 
+import pyarrow as pa
+from pathlib import Path
 from pytest import raises
 
 import kraken
@@ -13,23 +15,77 @@
 thisfile = Path(__file__).resolve().parent
 resources = thisfile / 'resources'
 
+def _validate_ds(self, path, num_lines, num_empty_lines, ds_type):
+    with pa.memory_map(path, 'rb') as source:
+        ds_table = pa.ipc.open_file(source).read_all()
+        raw_metadata = ds_table.schema.metadata
+        if not raw_metadata or b'lines' not in raw_metadata:
+            raise ValueError(f'{path} does not contain a valid metadata record.')
+        metadata = json.loads(raw_metadata[b'lines'])
+        self.assertEqual(metadata['type'],
+                         ds_type,
+                         f'Unexpected dataset type (expected: {ds_type}, found: {metadata["type"]})')
+        self.assertEqual(metadata['counts']['all'],
+                         num_lines,
+                         'Unexpected number of lines in dataset metadata '
+                         f'(expected: {num_lines}, found: {metadata["counts"]["all"]})')
+        self.assertEqual(len(ds_table),
+                         num_lines,
+                         'Unexpected number of rows in arrow table '
+                         f'(expected: {num_lines}, found: {len(ds_table)})')
+
+        real_empty_lines = len([line for line in ds_table.column('lines') if not str(line[0])])
+        self.assertEqual(real_empty_lines,
+                         num_empty_lines,
+                         'Unexpected number of empty lines in dataset '
+                         f'(expected: {num_empty_lines}, found: {real_empty_lines})')
+
+
 class TestKrakenArrowCompilation(unittest.TestCase):
     """
     Tests for binary datasets
     """
     def setUp(self):
-        self.xml = resources / '170025120000003,0074.xml'
-        self.bls = xml.XMLPage(self.xml)
+        self.xml = resources / '170025120000003,0074-lite.xml'
+        self.seg = xml.XMLPage(self.xml).to_container()
         self.box_lines = [resources / '000236.png']
 
     def test_build_path_dataset(self):
-        pass
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=4*self.box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='path')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_bbox')
 
     def test_build_xml_dataset(self):
-        pass
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.xml],
+                                 output_file=tmp_file.name,
+                                 format_type='xml')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
+
+    def test_build_seg_dataset(self):
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.seg],
+                                 output_file=tmp_file.name,
+                                 format_type=None)
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
 
-    def test_build_obj_dataset(self):
-        pass
+    def test_forced_type_dataset(self):
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=4*self.box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='path',
+                                 force_type='kraken_recognition_baseline')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
 
     def test_build_empty_dataset(self):
-        pass
+        """
+        Test that empty lines are retained in compiled dataset.
+        """
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.xml],
+                                 output_file=tmp_file.name,
+                                 format_type='xml',
+                                 skip_empty_lines=False)
+            _validate_ds(self, tmp_file.name, 5, 1, 'kraken_recognition_baseline')
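
A minimal usage sketch (not part of the patch) of the code path this patch repairs and that test_build_seg_dataset exercises: compiling a binary dataset directly from a parsed Segmentation container by passing format_type=None. The file names below are placeholders; the imports assume the kraken 5.x modules touched here (kraken.lib.xml.XMLPage and kraken.lib.arrow_dataset.build_binary_dataset).

    from pathlib import Path

    from kraken.lib import xml
    from kraken.lib.arrow_dataset import build_binary_dataset

    # Parse an ALTO/PAGE file and convert the XMLPage into the 5.x
    # Segmentation container that build_binary_dataset consumes when
    # format_type is None.
    seg = xml.XMLPage(Path('170025120000003,0074-lite.xml')).to_container()

    # Compile the container into an Arrow binary dataset; pass
    # skip_empty_lines=False to also retain lines without text.
    build_binary_dataset(files=[seg],
                         output_file='dataset.arrow',
                         format_type=None)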