From ac5e7b5214aefb0968715a7433ece52b749a3ae2 Mon Sep 17 00:00:00 2001
From: Benjamin Kiessling
Date: Thu, 9 May 2024 16:49:35 +0200
Subject: [PATCH] 5.x dataset from object regression and some basic tests so
 this doesn't happen again

---
 kraken/lib/arrow_dataset.py                   |  9 +--
 tests/resources/170025120000003,0074-lite.xml |  2 +-
 tests/test_arrow_dataset.py                   | 72 ++++++++++++++++---
 3 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/kraken/lib/arrow_dataset.py b/kraken/lib/arrow_dataset.py
index 3d4b79a0f..bf3423266 100755
--- a/kraken/lib/arrow_dataset.py
+++ b/kraken/lib/arrow_dataset.py
@@ -52,8 +52,7 @@ def _extract_line(xml_record, skip_empty_lines: bool = True, legacy_polygons: bo
         return lines, None, None
     if is_bitonal(im):
         im = im.convert('1')
-    recs = xml_record.lines.values()
-    for idx, rec in enumerate(recs):
+    for idx, rec in enumerate(xml_record.lines):
         seg = Segmentation(text_direction='horizontal-lr',
                            imagename=xml_record.imagename,
                            type=xml_record.type,
@@ -167,6 +166,8 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
     for doc in files:
         try:
             data = parse_fn(doc)
+            if format_type in ['xml', 'alto', 'page']:
+                data = data.to_container()
         except (FileNotFoundError, KrakenInputException, ValueError):
             logger.warning(f'Invalid input file {doc}')
             continue
@@ -191,12 +192,12 @@ def build_binary_dataset(files: Optional[List[Union[str, 'PathLike', 'Segmentati
     num_lines = 0
     for doc in docs:
         if format_type in ['xml', 'alto', 'page', None]:
-            lines = doc.lines.values()
+            lines = doc.lines
         elif format_type == 'path':
             lines = doc['lines']
         for line in lines:
             num_lines += 1
-            alphabet.update(line.text if format_type in ['xml', 'alto', 'page'] else line['text'])
+            alphabet.update(line.text if format_type in ['xml', 'alto', 'page', None] else line['text'])
 
     callback(0, num_lines)
 
diff --git a/tests/resources/170025120000003,0074-lite.xml b/tests/resources/170025120000003,0074-lite.xml
index 504794e0f..b87e85d4c 100644
--- a/tests/resources/170025120000003,0074-lite.xml
+++ b/tests/resources/170025120000003,0074-lite.xml
@@ -33,7 +33,7 @@
-            $-nor su hijo, De todos sus bienes, con los pactos
+
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 4ce06031a..31c3fb8ba 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -2,8 +2,10 @@
 import json
 import unittest
-from pathlib import Path
+import tempfile
 
+import pyarrow as pa
+from pathlib import Path
 from pytest import raises
 
 import kraken
@@ -13,23 +15,77 @@
 thisfile = Path(__file__).resolve().parent
 resources = thisfile / 'resources'
 
+def _validate_ds(self, path, num_lines, num_empty_lines, ds_type):
+    with pa.memory_map(path, 'rb') as source:
+        ds_table = pa.ipc.open_file(source).read_all()
+        raw_metadata = ds_table.schema.metadata
+        if not raw_metadata or b'lines' not in raw_metadata:
+            raise ValueError(f'{path} does not contain a valid metadata record.')
+        metadata = json.loads(raw_metadata[b'lines'])
+        self.assertEqual(metadata['type'],
+                         ds_type,
+                         f'Unexpected dataset type (expected: {ds_type}, found: {metadata["type"]})')
+        self.assertEqual(metadata['counts']['all'],
+                         num_lines,
+                         'Unexpected number of lines in dataset metadata '
+                         f'(expected: {num_lines}, found: {metadata["counts"]["all"]})')
+        self.assertEqual(len(ds_table),
+                         num_lines,
+                         'Unexpected number of rows in arrow table '
+                         f'(expected: {num_lines}, found: {len(ds_table)})')
+
+        real_empty_lines = len([line for line in ds_table.column('lines') if not str(line[0])])
+        self.assertEqual(real_empty_lines,
+                         num_empty_lines,
+                         'Unexpected number of empty lines in dataset '
+                         f'(expected: {num_empty_lines}, found: {real_empty_lines})')
+
+
 class TestKrakenArrowCompilation(unittest.TestCase):
     """
     Tests for binary datasets
     """
     def setUp(self):
-        self.xml = resources / '170025120000003,0074.xml'
-        self.bls = xml.XMLPage(self.xml)
+        self.xml = resources / '170025120000003,0074-lite.xml'
+        self.seg = xml.XMLPage(self.xml).to_container()
         self.box_lines = [resources / '000236.png']
 
     def test_build_path_dataset(self):
-        pass
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=4*self.box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='path')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_bbox')
 
     def test_build_xml_dataset(self):
-        pass
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.xml],
+                                 output_file=tmp_file.name,
+                                 format_type='xml')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
+
+    def test_build_seg_dataset(self):
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.seg],
+                                 output_file=tmp_file.name,
+                                 format_type=None)
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
 
-    def test_build_obj_dataset(self):
-        pass
+    def test_forced_type_dataset(self):
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=4*self.box_lines,
+                                 output_file=tmp_file.name,
+                                 format_type='path',
+                                 force_type='kraken_recognition_baseline')
+            _validate_ds(self, tmp_file.name, 4, 0, 'kraken_recognition_baseline')
 
     def test_build_empty_dataset(self):
-        pass
+        """
+        Test that empty lines are retained in compiled dataset.
+        """
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            build_binary_dataset(files=[self.xml],
+                                 output_file=tmp_file.name,
+                                 format_type='xml',
+                                 skip_empty_lines=False)
+            _validate_ds(self, tmp_file.name, 5, 1, 'kraken_recognition_baseline')
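
A minimal usage sketch (not part of the patch) of the code path this patch repairs and that test_build_seg_dataset exercises: compiling a binary dataset directly from a parsed Segmentation container by passing format_type=None. The file names below are placeholders; the imports assume the kraken 5.x modules touched here (kraken.lib.xml.XMLPage and kraken.lib.arrow_dataset.build_binary_dataset).

    from pathlib import Path

    from kraken.lib import xml
    from kraken.lib.arrow_dataset import build_binary_dataset

    # Parse an ALTO/PAGE file and convert the XMLPage into the 5.x
    # Segmentation container that build_binary_dataset consumes when
    # format_type is None.
    seg = xml.XMLPage(Path('170025120000003,0074-lite.xml')).to_container()

    # Compile the container into an Arrow binary dataset; pass
    # skip_empty_lines=False to also retain lines without text.
    build_binary_dataset(files=[seg],
                         output_file='dataset.arrow',
                         format_type=None)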